All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.marino_serna.parallel_tool.ParallelTool.scala Maven / Gradle / Ivy

package com.github.marino_serna.parallel_tool

import java.lang.reflect.Method

import org.apache.log4j.Logger
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.storage.StorageLevel

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, Future, Promise}
import scala.language.{implicitConversions, postfixOps}
import scala.reflect.runtime.universe._
import scala.util.Failure

/**
  * @author Marino Serna Sanmamed
  * @param spark SparkSession
  * @param storage used to write the results of the executed methods and to read them back
  * @param logs print logs for the execution, including the duration of the functions
  * @param futuresWithoutDependencies selects the strategy used by parallelNoDependencies: when true (the default)
  *                                   the functions run on a parallel collection without futures, when false
  *                                   they run inside futures; both solutions are valid
  */
class ParallelTool (spark: SparkSession, storage:Storage, logs:Boolean = false, futuresWithoutDependencies:Boolean=true){
  // log4j logger named after the runtime class of this instance.
  private val logger: Logger = Logger.getLogger(getClass.getName)
  // Function name -> future of the DataFrame produced by that function (rebuilt by generateFuturesAndPromises).
  private var futures:Map[String,Future[DataFrame]] = Map[String, Future[DataFrame]]()
  // Function name -> promise behind the future above; completed in finish with the function result.
  private var promises:Map[String,Promise[DataFrame]] = Map[String, Promise[DataFrame]]()
  // Function name -> future of the "finished" flag that waitFinish and isNotFail look at.
  private var futuresFlags:Map[String,Future[Boolean]] = Map[String, Future[Boolean]]()
  // Function name -> promise behind the flag above; completed with true in finish.
  private var promisesFlags:Map[String,Promise[Boolean]] = Map[String, Promise[Boolean]]()
  // Names of the functions requested through startApplication; used in log messages and by waitFinish
  // to tell a missing flag apart from a function that was never scheduled.
  private var functionsToExecute:List[String] = Nil //Used for logs


  /**
    * Initialize the futures and promises that will be required during the execution.
    *
    * Every name receives a fresh DataFrame promise and a fresh Boolean promise; the stored futures are the
    * ones backed by those promises. The four maps kept by this instance are replaced.
    * @param functionNames List of names that will be executed
    */
  private def generateFuturesAndPromises(functionNames:List[String]):Unit = {
    // Folding from the right inserts the names last-to-first, one fresh promise per insertion.
    val resultPromises:Map[String,Promise[DataFrame]] =
      functionNames.foldRight(Map[String, Promise[DataFrame]]()) { (name, built) =>
        built + (name -> Promise[DataFrame]())
      }
    val flagPromises:Map[String,Promise[Boolean]] =
      functionNames.foldRight(Map[String, Promise[Boolean]]()) { (name, built) =>
        built + (name -> Promise[Boolean]())
      }

    futures  = resultPromises.map { case (name, promise) => name -> promise.future }
    promises = resultPromises
    futuresFlags  = flagPromises.map { case (name, promise) => name -> promise.future }
    promisesFlags = flagPromises
  }

  private case class Caller[T>:Null<:AnyRef](klass:T) {

    /**
      * Retrieve the priority declared on a method through its PriorityExecution annotations.
      *
      * The expected execution times of all PriorityExecution annotations are added up; a method without
      * that annotation has priority 0.
      * Useful to find the critical path of execution and to start with the longer tasks.
      * @param methodName Name of the method whose priority is requested
      * @param args arguments of the method; their runtime classes select the overload
      * @return sum of the declared expected execution times
      */
    def priority(methodName:String,args:AnyRef*):Long={
      val parameterTypes = args.map(_.getClass)
      val target = klass.getClass.getMethod(methodName, parameterTypes:_*)
      target.getDeclaredAnnotations.toList.collect {
        case timed:PriorityExecution =>
          val expected:Long = timed.expectedExecutionTime()
          expected
      }.sum
    }

    /**
      * Retrieve the dependencies declared on a method, looking the method up by name.
      *
      * Useful to avoid lockups and to find the critical path of execution.
      * @param methodName Name of the method whose dependencies are requested
      * @param args arguments of the method; their runtime classes select the overload
      * @return names listed in the DependenceOf annotations of the method
      */
    def dependencies(methodName:String,args:AnyRef*):List[String]={
      val parameterTypes = args.map(_.getClass)
      val target:Method = klass.getClass.getMethod(methodName, parameterTypes:_*)
      dependencies(target)
    }

    /**
      * Collect, in declaration order, the names listed by every DependenceOf annotation of the method.
      * @param method method whose annotations are inspected
      * @return the declared dependencies, or an empty list when there are none
      */
    def dependencies(method:Method):List[String]={
      method.getDeclaredAnnotations.toList.flatMap {
        case declared: DependenceOf => declared.dependencies().toList
        case _ => List.empty[String]
      }
    }

    /**
      * Block until every dependency declared on the given method has finished.
      * @param method method that is waiting for its dependencies
      */
    private def waitFinishAnnotations(method:Method):Unit={
      for (dependency <- dependencies(method)) waitFinish(dependency)
    }

    /** Call one method of the wrapped instance using reflection.
      *
      * The call first waits for the dependencies declared on the method, then invokes it, and finally hands
      * the result to finish so it is stored and the waiting functions are released.
      * This method is designed for Spark and will work only for methods that return a DataFrame.
      *
      * Any error terminates the JVM with exit code -1 after being logged.
      *
      * @param methodName Name of the method that will be executed
      * @param args arguments for the method that will be executed; their runtime classes select the overload
      * @return The output of the method that was called. Must be of type DataFrame
      */
    def call[T<:Object, A: TypeTag](methodName:String,args:AnyRef*):DataFrame = {
      try {
        if(logs) logger.info(s"Prepare function: $methodName")
        // Resolved once: the same Method serves the dependency wait, the invocation and the storage lookup.
        val argTypes = args.map(_.getClass)
        val method = klass.getClass.getMethod(methodName, argTypes:_*)
        val timeOrig = System.currentTimeMillis()

        waitFinishAnnotations(method)
        if(logs) logger.info(s"In call function: $methodName")

        val dfResult:DataFrame = method.invoke(klass,args: _*).asInstanceOf[DataFrame]//.persist(StorageLevel.OFF_HEAP)
        val elapsedMs = System.currentTimeMillis() - timeOrig
        // Floating point division, so runs shorter than a second are not truncated to 0.00 min.
        val timeMin = elapsedMs / 60000.0
        if(logs) logger.info(f"duration function $methodName: $timeMin%.2f min (${elapsedMs}ms)")

        finish(method, methodName, dfResult)

        if(logs) logger.info(s"End call function: $methodName")
        dfResult
      } catch {
        case ex:ClassCastException =>
          val message = s"problem with the result of the function $methodName. The function must return a DataFrame\n ${ex.getMessage}"
          logger.error(message)
          ex.printStackTrace()
          System.exit(-1)
          // Only reached if the exit is prevented; also gives this branch a type.
          throw new Exception(message)
        case ex: Throwable =>
          // Method.invoke wraps whatever the callee throws; report the callee's own exception when present.
          val reported:Throwable = ex match {
            case wrapped:java.lang.reflect.InvocationTargetException if wrapped.getCause != null => wrapped.getCause
            case other => other
          }
          logger.error(s"ERROR in method $methodName: ${reported.getMessage}")
          ex.printStackTrace()
          System.exit(-1)
          throw ex
      }
    }

    /**
      * Check whether the wrapped instance exposes a public method with the given name.
      *
      * The comparison ignores case and the arguments are not taken into account.
      * @param methodName Name of the method that needs to be checked
      * @param args arguments for the method (currently unused)
      * @return true when some public method carries that name
      */
    def exist(methodName:String,args:AnyRef*):Boolean = {
      val publicMethodNames = klass.getClass.getMethods.map(_.getName)
      publicMethodNames.exists(_.equalsIgnoreCase(methodName))
    }

    /**
      * Names of all public methods of the wrapped instance whose only parameter is of type ParallelTool.
      * @return list of names of functions
      */
    def getAllFunctions:List[String]={
      val parallelToolClassName = "com.github.marino_serna.parallel_tool.ParallelTool"
      klass.getClass.getMethods.toList.collect {
        case candidate if candidate.getParameterTypes.length == 1 &&
          candidate.getParameterTypes.head.getName == parallelToolClassName => candidate.getName
      }
    }
  }
  /** Implicit view that makes the reflection helpers of Caller available directly on any reference. */
  private implicit def anyRef2callable[T>:Null<:AnyRef](klass:T):Caller[T] = {
    Caller[T](klass)
  }

  /**
    * Execute all methods provided as a parameter.
    *
    * The methods are weighted with their accumulated priority, ordered so that dependencies come first,
    * and each one is started in its own thread. The call returns once every thread has ended.
    *
    * @param functions map of instances that contain the methods to execute, with the names of the methods
    *                  of each instance that will be executed
    * @throws Exception when the dependencies form a loop or when a function is reported as failed
    */

  def startApplication[T<:Object](functions:Map[T, List[String]]):Unit ={
    val requestedFunctions:List[String] = functions.values.toList.flatten
    this.generateFuturesAndPromises(requestedFunctions)
    // Registered once and from a single thread. The names mirror the promises created just above, so a
    // name left over from a previous run can no longer be mistaken for a scheduled function.
    functionsToExecute = requestedFunctions

    /**
      * Update the priority of the list of tasks finding the critical path
      *
      * A method is settled when no pending method depends on it; its priority (time required to be executed)
      * is then added to the priority of each of its own dependencies that is still pending.
      *
      * @param sorted methods that are already updated
      * @param unSorted methods that need to be updated
      * @param notPossible number of iterations without changes, to avoid lockups
      * @return List of methods with the priority updated.
      */
    def updatePriority(sorted:List[(T,String,List[String],Long)],unSorted:List[(T,String,List[String],Long)],
                       notPossible:Int):List[(T,String,List[String],Long)]={
      unSorted match {
        case Nil => sorted
        case head :: tail =>
          val (_,functionName,dependencies,priority) = head
          val noPendingDependant = !unSorted.exists(pending => pending._3.contains(functionName))
          if(noPendingDependant){
            val updatedPriority = tail.map(x => (x._1,x._2,x._3, if(dependencies.contains(x._2)) x._4 + priority else x._4 ))
            updatePriority(sorted :+ head, updatedPriority,0)
          }else{
            if(notPossible > unSorted.size){
              val error = s"Found a loop with the dependencies: ${unSorted.map(x=>x._2).mkString(" | ")} while updating the priority"
              logger.error(error)
              throw new Exception(error)
            }else{
              // Retry this method after the ones that depend on it have been settled.
              updatePriority(sorted, tail :+ head, notPossible + 1)
            }
          }
      }
    }

    /**
      * Sort the functions to avoid lockups and optimize execution time.
      *
      * A method is emitted once all its dependencies are already emitted, or when none of its missing
      * dependencies is pending at all (they are then expected to be available from elsewhere).
      *
      * @param sorted List of methods already sorted
      * @param unSorted List of methods that need to be sorted
      * @param notPossible number of iterations without changes, to avoid lockups
      * @return List of methods sorted
      */
    def sortListOfFunctions(sorted:List[(T,String,List[String])],unSorted:List[(T,String,List[String])],notPossible:Int):List[(T,String,List[String])]={
      unSorted match {
        case Nil => sorted
        case head :: tail =>
          val dependencies = head._3
          val sortedNames = sorted.map(x => x._2)
          val dependenciesProcessed = dependencies.forall(dependency => sortedNames.contains(dependency))
          if(dependenciesProcessed){
            sortListOfFunctions(sorted :+ head, tail,0)
          }else{
            val pendingNames = unSorted.map(x => x._2)
            val dependenciesPending = dependencies.exists(dependency => pendingNames.contains(dependency))
            if(dependenciesPending){
              if(notPossible > unSorted.size){
                val error = s"Found a loop with the dependencies: ${unSorted.map(x=>x._2).mkString(" | ")} while sorting functions"
                logger.error(error)
                throw new Exception(error)
              }else{
                sortListOfFunctions(sorted, tail :+ head, notPossible + 1)
              }
            }else{
              val error = s"Some of the dependencies: ${dependencies.mkString(" | ")} will not be executed," +
                s"Ensure that the data produced by this methods is available"
              logger.debug(error)
              sortListOfFunctions(sorted :+ head, tail,0)
            }
          }
      }
    }

    /**
      * Read the dependencies and the priority of every requested method and order them by accumulated priority.
      * @return List of functions that should be executed, highest accumulated priority first
      */
    def getListOfFunctions():List[(T,String,List[String])] = {
      // Plain sequential traversal: only annotation lookups happen here and no shared state is touched.
      val functionsOutOfOrder:List[(T,String,List[String],Long)] = functions.toList.flatMap {
        case (classToExecute, listOfFunctions) =>
          if(listOfFunctions.isEmpty){
            logger.debug(s"No functions were setup for ${classToExecute.getClass.getName}")
          }
          listOfFunctions.map(functionName => (
            classToExecute,
            functionName,
            classToExecute dependencies (functionName, this),
            classToExecute priority (functionName, this)
          ))
      }

      updatePriority(Nil,functionsOutOfOrder,0)
        .sortBy(x => -x._4)
        .map(x => (x._1,x._2,x._3))
    }

    val fullListFunctions = getListOfFunctions()

    // Methods without dependencies are ready from the start; the rest are placed after what they need.
    val listOfFunctionsSorted = sortListOfFunctions(fullListFunctions.filter(x=>x._3.isEmpty),fullListFunctions.filter(x=> x._3.nonEmpty),0)

    if(logs) logger.info(s"${listOfFunctionsSorted.size} execution order: ${listOfFunctionsSorted.map(x=>x._2).mkString(" | ")}")
    // Inside the Runnable `this` is the Runnable itself, so the tool is captured under another name.
    val thisParallel = this
    var execution:List[Thread] = listOfFunctionsSorted.map(methodToExecute =>{
      new Thread(new Runnable{def run(): Unit = {methodToExecute._1 call (methodToExecute._2, thisParallel)}})
    })
    execution.foreach(_.start())

    if(logs) logger.info(s"functions to process: ${execution.size} ")
    while(execution.nonEmpty) {
      execution.head.join()
      execution = execution.filter(fut => fut.isAlive)
      if(logs) logger.info(s"remaining functions to process: ${execution.size} ")
    }

    val functionsWithErrors:List[String] = promisesFlags.keySet.filter(functionName => !isNotFail(functionName)).toList

    if(functionsWithErrors.nonEmpty){
      val error = s"Error during the execution of the functions: ${functionsWithErrors.mkString(" | ")}"
      logger.error(error)
      throw new Exception(error)
    }
  }

  /**
    * Execute every method that is marked to be executed, that is, every public method whose only
    * parameter is the class ParallelTool.
    *
    * example of use:
    * val storage:MyStorage = new MyStorage
    * val parallelTool = new ParallelTool(spark, storage)
    * val class1 = new Class1()
    * val class2 = new Class2()
    * val class3 = new Class3()
    * parallelTool.startApplication(class1 :: class2 :: class3 :: Nil)
    *
    * @param executionClasses Instances where to look for methods
    */
  def startApplication[T<:Object](executionClasses:List[Object]):Unit = {
    val functionsByClass = executionClasses.map { instance =>
      instance -> instance.getAllFunctions
    }.toMap
    startApplication(functionsByClass)
  }

  /**
    * Report whether the execution of a function is free of failures, without blocking.
    *
    * A flag that is not completed yet counts as "not failed"; a flag completed with an error is logged
    * and counts as failed; a name without a registered flag counts as failed.
    * @param functionName Name of the method to check
    * @return true if the execution is correct (or still pending), false if it failed.
    */
  private def isNotFail(functionName:String):Boolean={
    futuresFlags.get(functionName) match {
      case Some(flag) =>
        // Inspect the current state directly, so a failure is returned as false instead of being rethrown.
        flag.value match {
          case Some(Failure(exception)) =>
            val error = s"Error in the function: $functionName fail during the execution. ${exception.getMessage}"
            logger.error(error, exception)
            false
          case Some(completed) => completed.getOrElse(false)
          case None => true
        }
      case _ =>
        false
    }
  }

  /**
    * Block until the function has been completed and its result can be used by other functions.
    *
    * A name that was never scheduled does not block: it is only reported at debug level.
    * @param functionName Name of the function that needs to be checked
    * @return the value of the function flag, or true when the function is not scheduled
    * @throws Exception when the function is scheduled but has no flag registered
    */
  private def waitFinish(functionName:String):Boolean={
    futuresFlags.get(functionName).map { flag =>
      if(logs) logger.info(s"in waiting($functionName)")
      val finished:Boolean = Await.result(flag, Duration.Inf)
      if(logs) logger.info(s"out waiting($functionName)")
      finished
    }.getOrElse {
      if(!functionsToExecute.contains(functionName)){
        val notice = s"The function: $functionName is not scheduled to be executed, " +
          s"Ensure that the data produced by this methods is available." +
          s"The full list of functions to be executed is: ${functionsToExecute.mkString(" | ")}}"
        logger.debug(notice)
        true
      }else{
        val problem = s"Error with the execution of the function: $functionName, flag is not available"
        logger.error(problem)
        throw new Exception(problem)
      }
    }
  }

  /**
    * Read the Store annotations of a method and return the schema/folder, the table name/file name and the
    * partitions (if required). An empty list means that the result will be stored in a temporal location.
    *
    * An annotation is ignored when it is flagged as temporal or when its schema or its name is empty.
    * These values will be interpreted by the implementation of Storage, the uses mentioned above are suggestions.
    * @param method method to check
    * @return (Schema, name, partitions) :: Nil
    */
  def getSource(method:Method):List[(String,String, List[String])]={
    method.getDeclaredAnnotations.toList.flatMap {
      case target: Store if !target.temporal() && !target.schema().isEmpty && !target.name().isEmpty =>
        List((target.schema(), target.name(), target.partitions().toList))
      case _ =>
        Nil
    }
  }

  /**
    * Notify that the method has been executed, and store the result.
    *
    * The result is written through the storage when the method declares a persistent location, otherwise it
    * is persisted with StorageLevel.OFF_HEAP. Afterwards the result promise and the flag promise registered
    * under the function name are completed; a name without promises is only reported at debug level.
    *
    * If the difference between the time required to read from the disk of the workers and the time required
    * to read from the persistent storage is big, you can force writing always to the temporal storage and,
    * after updating the flags, write to the persistent storage.
    *
    * Author's note: this application has been tested in an environment with regular/slow hard drives in the
    * cluster workers/master but a pretty fast persistence environment (google cloud storage); if you have
    * fast SSD drives in your workers, or your persistent storage is slower than the drives used in your
    * cluster, you should test that optimization.
    *
    * Author's note: please do not be tempted to use cache to store the DataFrames instead of
    * StorageLevel.OFF_HEAP, because spark will consider that you are not using those DataFrames and
    * sometimes you will lose them.
    *
    * @param method method that has been executed
    * @param functionName name of the method that has been executed
    * @param value DataFrame produced by the method
    */
  private def finish(method: Method, functionName:String, value:DataFrame):Unit={
    def reportUnplanned():Unit={
      // this could be an error, but also happens in a correct execution, like when using parallelNoDependencies
      val notice = s"One function is trying write a value in the name of a function ($functionName) that is not plan to be executed, " +
        s"please review the list of functions to execute and the functions that addData to ($functionName)"
      logger.debug(notice)
    }

    // Only the first declared location is used.
    getSource(method).headOption match {
      case Some((schemaName, tableName, partitions)) =>
        storage.write(schemaName, tableName, value, partitions)
      case None =>
        value.persist(StorageLevel.OFF_HEAP)
    }

    promises.get(functionName).fold(reportUnplanned()) { resultPromise => resultPromise.success(value); () }
    promisesFlags.get(functionName).fold(reportUnplanned()) { flagPromise => flagPromise.success(true); () }
  }

  /**
    * Retrieve the result of a method once it is completed.
    *
    * The call blocks until the method has finished. When the method declares a persistent location the
    * result is read back through the storage, otherwise it is taken from the future of the method.
    * @param methodName Method name of which the output is requested
    * @param klass instance where the method can be found; the method must take this tool as its only parameter
    * @return DataFrame with the result of the method
    * @throws Exception when the method has no persistent location and was not scheduled for execution
    */
  def get(methodName:String, klass:AnyRef):DataFrame = {

    if(logs) logger.info(s"get($methodName, ${klass.getClass.getName})")

    waitFinish(methodName)
    val timeOrig = System.currentTimeMillis()
    val finalRes = try {
      // Looked up with the runtime class of this tool as the single parameter type.
      val method:Method = klass.getClass.getMethod(methodName, this.getClass)
      getSource(method) match{
        case Nil =>
          futures.get(methodName) match {
            case Some(promise) =>
              Await.result(promise, Duration.Inf)
            case _ =>
              val error = s"The function: $methodName need to be executed, please review the list of functions" +
                s" to execute: ${functionsToExecute.mkString(" | ")}}"
              logger.error(error)
              throw new Exception(error)
          }
        case (schemaName,tableName, _) :: _ => storage.read(schemaName,tableName)
      }
    }catch{
      case ex:Throwable =>
        if(logs) logger.error(f"catch from disk get($methodName)")
        ex.printStackTrace()
        throw ex
    }
    val elapsedMs = System.currentTimeMillis() - timeOrig
    // Floating point division, so reads shorter than a second are not truncated to 0.00 min.
    val timeMin = elapsedMs / 60000.0
    if(logs) logger.info(f"duration get($methodName): $timeMin%.2f min (${elapsedMs}ms)")
    finalRes
  }

  /**
    * Execute in parallel a list of methods without dependencies.
    *
    * This logic is far simpler and works great when one method needs to process a list of methods and use
    * the results only inside that method.
    * Author's note: the output of these methods has been tested in conditions where the DataFrames fit in
    * memory; the use of massive DataFrames in relation to the cluster size could require more testing.
    *
    * It is possible to use both parallel systems at the same time, in fact it is recommended.
    *
    * The strategy depends on the constructor flag futuresWithoutDependencies: true (the default) runs the
    * functions without futures, false runs them inside futures. The author recommends the default
    * configuration, which in his experience works faster.
    *
    * Example of use:
    *
    * val functionsToExecute =
    * ("function1", parameter1a :: Nil) ::
    * ("function2", parameter2a :: parameter2b :: parameter2c :: Nil) ::
    * ("function3", parameter3a :: parameter3b :: Nil) ::
    * Nil
    * val (resultDF1 ::
    * resultDF2 ::
    * resultDF3 ::
    * _ ) = parallelTool.parallelNoDependencies(this, functionsToExecute)
    *
    * @param classToExecute instance that contains the methods that need to be executed.
    * @param functions list of functions that will be executed, each with its list of arguments
    * @return list of DataFrames containing all the results
    */
  def parallelNoDependencies[T<:Object, A: TypeTag](classToExecute:T, functions:List[(String,List[AnyRef])]):List[DataFrame] ={
    // Rendered only when a log line is actually written.
    def requested:String = functions.mkString(" | ")

    val dfResults =
      if(!futuresWithoutDependencies){
        if(logs) logger.info(s"parallelNoDependenciesWithFutures: $requested")
        parallelNoDependenciesWithFutures(classToExecute, functions)
      }else{
        if(logs) logger.info(s"parallelNoDependenciesWithoutFutures: $requested")
        parallelNoDependenciesWithoutFutures(classToExecute, functions)
      }
    if(logs) logger.info(s"End parallelNoDependencies : $requested")
    dfResults
  }

  /**
    * Run every function inside its own future and collect the results in the order of the request.
    *
    * @param classToExecute instance that contains the methods that need to be executed
    * @param functions list of functions that will be executed, each with its list of arguments
    * @return list of DataFrames containing all the results
    * @throws Exception when the number of results differs from the number of requested functions
    */
  private def parallelNoDependenciesWithFutures[T<:Object, A: TypeTag](classToExecute:T, functions:List[(String,List[AnyRef])]):List[DataFrame] = {
    // Start all the futures before waiting for any of them, so they run concurrently.
    val launched:List[(String,Future[DataFrame])] = functions.flatMap {
      case (functionName,parameters) =>
        val running: Future[DataFrame] = Future {
          classToExecute call (functionName, parameters:_*)
        }
        // Registered before blocking on the future, otherwise a failure would be rethrown by Await
        // before the callback is ever attached.
        running.onComplete {
          case Failure(error) => logger.error(s"An error has occurred with the function $functionName: ${error.getMessage}")
          case _ =>
        }
        List(functionName -> running)
      case _ =>
        logger.error("problem with functionName and parameters")
        Nil
    }

    // A null result is dropped here and detected by the size check below.
    val resultDF:List[DataFrame] = launched.flatMap {
      case (_, running) => Option(Await.result(running, Duration.Inf)).toList
    }

    if(resultDF.length != functions.length){
      val error = s"${resultDF.length} != ${functions.length} One of the method is not returning a DataFrame  ${functions.map(x => x._1).mkString(" | ")}"
      logger.error(error)
      throw new Exception(error)
    }
    resultDF
  }

  /**
    * Run every function on a parallel collection and collect the results in the order of the request.
    *
    * @param classToExecute instance that contains the methods that need to be executed
    * @param functions list of functions that will be executed, each with its list of arguments
    * @return list of DataFrames containing all the results
    * @throws Exception when the number of results differs from the number of requested functions
    */
  private def parallelNoDependenciesWithoutFutures[T<:Object, A: TypeTag](classToExecute:T, functions:List[(String,List[AnyRef])]):List[DataFrame] ={
    val outcomes:List[Option[DataFrame]] = functions.par.map {
      case (functionName, parameters) =>
        // Option(...) turns a null result into None.
        Option(classToExecute call(functionName, parameters: _*))
      case _ =>
        logger.error("problem with functionName and parameters")
        None
    }.toList
    val resultDF:List[DataFrame] = outcomes.flatten

    if(resultDF.length == functions.length){
      resultDF
    }else{
      val error = s"${resultDF.length} != ${functions.length} One of the method is not returning a DataFrame  ${functions.map(x => x._1).mkString(" | ")}"
      logger.error(error)
      throw new Exception(error)
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy