com.intel.analytics.bigdl.optim.ParallelOptimizer.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of bigdl-SPARK_3.0 Show documentation
The newest version!
/*
 * Copyright 2016 The BigDL Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.analytics.bigdl.optim

import com.intel.analytics.bigdl.dataset._
import com.intel.analytics.bigdl.models.utils.ModelBroadcast
import com.intel.analytics.bigdl.nn.mkldnn.MklDnnContainer
import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase
import com.intel.analytics.bigdl.nn.{Container, Utils}
import com.intel.analytics.bigdl.optim.DistriOptimizer._
import com.intel.analytics.bigdl.parameters.AllReduceParameter
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.utils._
import com.intel.analytics.bigdl.visualization.{TrainSummary, ValidationSummary}
import com.intel.analytics.bigdl.{Module, _}
import java.io.{File, FilenameFilter}
import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.concurrent.Future
import scala.reflect.{ClassTag, classTag}

object ParallelOptimizer extends AbstractOptimizer {

  import Optimizer._

  val logger: Logger = Logger.getLogger(getClass)

  /**
   * Train the model.
   *
   * @param dataset train dataset
   * @param coresPerNode cores per node
   * @param state state table
   * @param endWhen trigger to stop training
   * @param metrics metrics
   * @param models cached models
   * @param optimMethods optimization methods
   * @param validationTrigger validation trigger
   * @param validationDataSet validation dataset
   * @param validationMethods validation methods
   * @param cacheTrigger cache trigger
   * @param cachePath cache path
   * @param trainSummary train summary
   * @param validationSummary validation summary
   * @param isOverWrite if overwrite the checkpoint
   */
  private[optim] def optimize[T: ClassTag](
    trainingModel: Module[T],
    dataset: DistributedDataSet[MiniBatch[T]],
    coresPerNode: Int,
    state: Table,
    endWhen: Trigger,
    metrics: Metrics,
    models: RDD[Cache[T]],
    optimMethods: Map[String, OptimMethod[T]],
    validationTrigger: Option[Trigger],
    validationDataSet: Option[DataSet[MiniBatch[T]]],
    validationMethods: Option[Array[ValidationMethod[T]]],
    cacheTrigger: Option[Trigger],
    cachePath: Option[String],
    trainSummary: Option[TrainSummary],
    validationSummary: Option[ValidationSummary],
    isOverWrite: Boolean
  )(implicit ev: TensorNumeric[T]): Unit = {
    val sc = dataset.originRDD().sparkContext
    val partitionNum = dataset.originRDD().partitions.length
    var wallClockTime = 0L
    var lastEpochTime = 0L

    // driverState is needed to prevent serializing the whole optimizer
    optimMethods.values.foreach { optimMethod =>
      if (!optimMethod.state.contains("epoch")) optimMethod.state.update("epoch", 1)
      if (!optimMethod.state.contains("neval")) optimMethod.state.update("neval", 1)
      if (!optimMethod.state.contains("Loss")) {
        optimMethod.state.update("Loss", Float.PositiveInfinity)
      }
      if (!optimMethod.state.contains("score")) optimMethod.state.update("score", 0f)
      if (!optimMethod.state.contains("recordsProcessedThisEpoch")) {
        optimMethod.state.update("recordsProcessedThisEpoch", 0)
      }
    }

    val _subModelNumber = Engine.getEngineType() match {
      case MklBlas => coresPerNode
      case MklDnn => 1
    }

    require(_subModelNumber == 1, "currently only single model supported especially for mkldnn")

    val driverState = T(
      "epoch" -> optimMethods.values.head.state("epoch"),
      "neval" -> optimMethods.values.head.state("neval"),
      "Loss" -> optimMethods.values.head.state("Loss"),
      "score" -> optimMethods.values.head.state("score"),
      "parallelism" -> _subModelNumber
    )
    logger.info("Count dataset")
    val countBefore = System.nanoTime()
    val numSamples = dataset.data(train = false).map(_.size()).reduce(_ + _)
    val countAfter = System.nanoTime()
    logger.info(s"Count dataset complete. Time elapsed: ${(countAfter - countBefore) / 1e9}s")
    if (numSamples != dataset.size()) {
      logger.warn("If the dataset is built directly from RDD[Minibatch], the data in each " +
        "minibatch is fixed, and a single minibatch is randomly selected in each partition. If " +
        "the dataset is transformed from RDD[Sample], each minibatch will be constructed on the " +
        "fly from random samples, which is better for convergence.")
    }

    logger.info(s"config $state")
    var recordsProcessedThisEpoch = optimMethods.values.head.state[Int]("recordsProcessedThisEpoch")
    if (recordsProcessedThisEpoch == 0) {
      val shuffleBefore = System.nanoTime()
      logger.info("Shuffle data")
      dataset.shuffle()
      val shuffleEnd = System.nanoTime()
      logger.info(s"Shuffle data complete. Takes ${(shuffleEnd - shuffleBefore) / 1e9}s")
    }

    var tasks: ArrayBuffer[Future[_]] = new ArrayBuffer()
    var threshold = Long.MaxValue
    var timeout = Long.MaxValue
    var iteration = 0
    val dropPercentage = state.get[Double]("dropPercentage").get
    val warmupIterationNum = state.get[Int]("warmupIterationNum").get
    val computeThresholdbatchSize = state.get[Int]("computeThresholdbatchSize").get
    val maxDropPercentage = state.get[Double]("maxDropPercentage").get
    val iterationPerTime = System.getProperty("bigdl.parallelOptimizer." +
      "iterationPerTime", "1").toInt
    val driverSubModelNum = partitionNum * _subModelNumber * iterationPerTime
    var dropModelNumBatch = 0
    var lossArray = new Array[Double](_subModelNumber)

    var epochStart = System.nanoTime()
    var dataRDD = dataset.data(train = true)

    while (!endWhen(driverState)) {
      var lossSum = 0.0
      var recordsNum = 0
      metrics.set("computing time for each node", mutable.ArrayBuffer[Double](), sc)
      metrics.set("computing time average", 0.0, sc, partitionNum)

      val driverMetrics = metrics
      val start = System.nanoTime()
      /*
        Run the forwards/backwards pass using multiple threads in each partition, and track the
        number of model updates that finished before the thread timeout mechanism.
       */
      val (numFinishedModelUpdates, localLossSum, localRecordsNum) = dataRDD
        .zipPartitions(models, preservesPartitioning = true) { (data, modelIter) => {
          var count = 0
          var finishedThreadSize = 0
          val cached = modelIter.next()
          // val miniBatchBuffer = new Array[MiniBatch[T]](_subModelNumber)
          var miniBatch: MiniBatch[T] = null
          while (count < iterationPerTime) {
            val syWStart = System.nanoTime()
            miniBatch = data.next()
            // ======================Start train models===================================
            var time = System.nanoTime()
            if (dropPercentage > 0.0 && iteration > warmupIterationNum +
              computeThresholdbatchSize - 1) {
              timeout = threshold
            }
            val pre = (iteration % computeThresholdbatchSize) * _subModelNumber
            val trainingThreads = Engine.default.invokeAndWait2(Seq(() => {
              val trainStart = System.nanoTime()
              val localModel = cached.localModels(0)
              localModel.training()
              val localCriterion = cached.localCriterions(0)
              val input = miniBatch.getInput()
              val target = miniBatch.getTarget()
              val output = localModel.forward(input)
              lossArray(0) = ev.toType[Double](localCriterion.forward(output, target))
              val errors = localCriterion.backward(output, target)
              localModel.backward(input, errors)
              cached.moduleTimeList(0 + pre) = System.nanoTime() - trainStart
              0
            }), timeout)
            val computingTime = System.nanoTime() - time
            driverMetrics.add("computing time average", computingTime)
            driverMetrics.add("computing time for each node", computingTime)

            val finishedThreads = trainingThreads.filter(!_.isCancelled).map(_.get())
            val currFinishedSize = finishedThreads.size
            finishedThreadSize += currFinishedSize
            recordsNum += currFinishedSize * miniBatch.size
            var i = 0
            while (i < currFinishedSize) {
              lossSum += lossArray(finishedThreads(i))
              i += 1
            }
            count += 1
          }
          val end = System.nanoTime()
          wallClockTime += end - start
          Iterator.single(finishedThreadSize, lossSum, recordsNum)
        }
        }.reduce((a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3))

      dropModelNumBatch += (driverSubModelNum - numFinishedModelUpdates)

      if (dropPercentage == 0.0 ||
        numFinishedModelUpdates >= driverSubModelNum * (1.0 - maxDropPercentage)) {
        driverState("numFinishedModel") = numFinishedModelUpdates
        recordsProcessedThisEpoch += localRecordsNum
        val end = System.nanoTime()
        wallClockTime += end - start
        driverState("Loss") = localLossSum / numFinishedModelUpdates
        optimMethods.foreach { v =>
          v._2.updateHyperParameter()
        }

        driverState(s"LearningRate") = optimMethods.head._2.getLearningRate().toFloat

        driverState("Throughput") = localRecordsNum.toFloat / ((end - start) / 1e9f)
        val _header = header(driverState[Int]("epoch"), recordsProcessedThisEpoch, numSamples,
          driverState[Int]("neval"), wallClockTime)
        logger.info(s"${_header} Trained ${localRecordsNum} records in ${(end - start) / 1e9} " +
          s"seconds. Throughput is ${driverState("Throughput")} records/second. Loss is ${
            driverState("Loss")
          }.")
        logger.debug("\n" + metrics.summary())
        logger.debug("Dropped modules: " + (driverSubModelNum - numFinishedModelUpdates))
        lossArray = new Array[Double](_subModelNumber)

        iteration += 1
        if (dropPercentage > 0.0 && iteration > warmupIterationNum &&
          iteration % computeThresholdbatchSize == 0) {
          val moduleTimeList = models.mapPartitions { iter =>
            iter.next().moduleTimeList.iterator
          }.collect()

          val k = (dropPercentage * computeThresholdbatchSize * driverSubModelNum).toInt
          if (k > dropModelNumBatch) {
            threshold = Util.kthLargest(moduleTimeList, 0, moduleTimeList.length - 1,
              k - dropModelNumBatch)
          } else {
            threshold = (threshold * 1.01).toLong
          }
          logger.info("threshold: " + threshold)

          // clear moduleTimeList in each node
          models.mapPartitions { iter =>
            val timeList = iter.next.moduleTimeList
            var i = 0
            while (i < timeList.length) {
              timeList(i) = 0
              i += 1
            }
            Iterator.empty
          }.count()
          dropModelNumBatch = 0
        }
        driverState("neval") = driverState[Int]("neval") + iterationPerTime
        if (recordsProcessedThisEpoch >= numSamples) {
          // Epoch is finished
          val epochEnd = System.nanoTime()
          wallClockTime = lastEpochTime + epochEnd - epochStart
          lastEpochTime = wallClockTime
          epochStart = System.nanoTime()
          logger.info(s"${_header} Epoch finished. Wall clock time is ${wallClockTime / 1e6} ms")

          driverState("epoch") = driverState[Int]("epoch") + 1
          dataset.shuffle()
          dataRDD = dataset.data(train = true)
          recordsProcessedThisEpoch = 0
        }

        optimMethods.map { case (moduleName, optimMethod) =>
          optimMethod.state.update("recordsProcessedThisEpoch", recordsProcessedThisEpoch)
          optimMethod.state.update("epoch", driverState[Int]("epoch"))
          optimMethod.state.update("neval", driverState[Int]("neval"))
          optimMethod.state.update("Loss", driverState[Float]("Loss"))
          if (validationMethods.isDefined) {
            optimMethod.state.update("score", driverState[Float]("score"))
          }
        }

        // update parameters for last iteration
        if (endWhen(driverState)) {
          logger.info(s"training finished, updating all layers parameters")
          models.mapPartitions(modelIter => {
            val localModels = modelIter.next.localModels
            val updateTaskes = localModels.map(localModel => () => {
              updateLayerParameters(localModel)
            })
            Engine.default.invokeAndWait2(updateTaskes)
            Iterator.empty
          }).collect
        }
        validate(
          validationTrigger,
          validationDataSet,
          validationMethods,
          coresPerNode,
          models,
          driverState,
          validationSummary,
          _header
        )

        trainSummary.foreach { summary =>
          saveSummary(
            summary,
            models,
            driverState,
            null,
            trainingModel
          )
        }

        checkpoint(
          cacheTrigger,
          cachePath,
          isOverWrite,
          wallClockTime,
          models,
          driverState,
          null,
          optimMethods,
          trainingModel
        )
      } else {
        logger.info(s"Warning! Not enough training samples were successfully processed in this " +
          s"iteration due to some slow tasks. The gradients computed in this iteration will be " +
          s"discarded. Only $numFinishedModelUpdates/$driverSubModelNum threads successfully " +
          s"completed training.")
      }
    }
  }

  private def updateLayerParameters[T: ClassTag](module: Module[T]): Unit = {
    module.updateParameter
    if (module.isInstanceOf[Container[_, _, T]]) {
      module.asInstanceOf[Container[_, _, T]].modules.foreach(sub => {
        updateLayerParameters(sub)
      })
    }
  }

  /**
   * Init engine and cache models, weights, gradients, criterions, state tables
   * and validation methods on worker nodes.
   *
   * @param model train model
   * @param dataset train dataset
   * @param criterion loss function
   * @param state state table
   * @param nodeNumber node number
   * @param coresPerNode cores per node
   * @param checkSingleton if checkSingleton
   * @param validationMethods validation methods
   * @param optimMethod optimization method
   * @return cached models
   */
  private def initThreadModels[T: ClassTag](
    model: Module[T],
    dataset: DistributedDataSet[MiniBatch[T]],
    criterion: Criterion[T],
    state: Table,
    nodeNumber: Int,
    coresPerNode: Int,
    checkSingleton: Boolean,
    validationMethods: Option[Array[ValidationMethod[T]]],
    optimMethod: Map[String, OptimMethod[T]],
    priorities: mutable.Map[String, Int]
  )(implicit ev: TensorNumeric[T]) = {
    val sc = dataset.originRDD().sparkContext
    val broadcast = sc.broadcast((criterion, state, validationMethods, optimMethod))
    val modelBroadcast = ModelBroadcast[T]().broadcast(sc, model)
    model.getParameters()
    val _subModelNumber = Engine.getEngineType match {
      case MklBlas => coresPerNode
      case MklDnn => 1
    }

    require(dataset.originRDD().partitions.length == nodeNumber,
      s"Passed in rdd partition number ${dataset.originRDD().partitions.length}" +
        s" is not equal to configured node number ${nodeNumber}")

    val computeThresholdbatchSize = state.get[Int]("computeThresholdbatchSize").get
    val nExecutor = Engine.nodeNumber()

    val parameterBlocks = System.getProperty("bigdl.parallelOptimizer." +
      "parameterBlocks", "10").toInt

    val models = dataset.originRDD().mapPartitions(_ => {
      val partitionId = TaskContext.getPartitionId
      val (broadcastCriterion, broadcastState, broadcastMethod,
      broadcastOptim) = broadcast.value
      if (!Engine.checkSingleton()) {
        if (checkSingleton) {
          require(Engine.checkSingleton(), "Partitions of the training data are not evenly" +
            "distributed across the executors in the Spark cluster; are there sufficient training" +
            "data to be distributed? Set property \"bigdl.check.singleton\" to false to skip " +
            "this check")
        } else {
          logger.warn("Partitions of the training data are not evenly" +
            "distributed across the executors in the Spark cluster; are there sufficient training" +
            "data to be distributed?")
        }
      }
      Engine.setNodeAndCore(nExecutor, coresPerNode)
      // initialize synchronizer with partition ID and parition number
      val synchronizer = new BlockManagerParameterSynchronizer[T](partitionId, nExecutor)
      val cached = (0 until _subModelNumber).map { _ =>
        val localModel = modelBroadcast.value(true, false)
        localModel match {
          case container: MklDnnContainer => container.compile(TrainingPhase)
          case _ =>
        }
        // differentiate partition models from each other by partition ID
        setModelId(localModel, partitionId)
        // set parameter synchronizer
        setDistriPartitionsynchronizer(localModel, synchronizer, new mutable.HashMap[Int, Int](),
          parameterBlocks)
        val localCriterion = broadcastCriterion.cloneCriterion()
        val localState = broadcastState.clone()
        val localMethod = if (broadcastMethod.isDefined) {
          Some(broadcastMethod.get.map(_.clone()))
        } else None
        (localModel, Tensor[T](0), Tensor[T](0), localCriterion, localState, localMethod)
      }.toArray

      logger.info("model thread pool size is " + Engine.model.getPoolSize)
      val weights = cached.head._2
      Iterator.single(CacheV1(
        cached.map(_._1), // models
        cached.map(_._2), // weights
        cached.map(_._3), // gradients
        cached.map(_._4), // criterions
        cached.map(_._5), // states
        new Array[Long](_subModelNumber * computeThresholdbatchSize),
        cached.map(_._6),
        broadcastOptim.map(v => (v._1, v._2.clone())),
        synchronizer
      ))
    }).persist()
    models.setName("Thread Model RDD")
    logger.info("Cache thread models...")
    models.count()
    logger.info("Cache thread models... done")
    models
  }

  private def getExecutionOrder[T: ClassTag](module: Module[T]): ArrayBuffer[Module[T]] = {
    val res = new ArrayBuffer[Module[T]]
    if (module.isInstanceOf[Container[_, _, T]]) {
      val subModules = module.asInstanceOf[Container[_, _, T]].modules
      subModules.foreach(sub => {
        res ++= getExecutionOrder(sub)
      })
    } else {
      if (module.parameters() != null) {
        res += module
      }
    }
    res
  }

  private def setDistriPartitionsynchronizer[T: ClassTag](model: Module[T],
    parameterSynchronizer: DistriParameterSynchronizer[T],
    barrierLayers: mutable.Map[Int, Int], slices: Int): Unit = {
    val globalWeights = model.getParameters()._1
    val globalGrads = model.getParameters()._2
    val totalSize = globalGrads.nElement
    val executorOrders = getExecutionOrder(model)
    var i = executorOrders.length - 1
    val size = totalSize / slices - 1
    val extraSize = totalSize - size * (slices - 1)
    var lastOffSet = totalSize
    while (i >= 0) {
      val currModule = executorOrders(i)
      if (currModule.parameters() != null) {
        val grads = currModule.getParameters()._1
        val offSet = grads.storageOffset - 1
        val index = if (offSet == 0) 0 else (offSet - 1) / size + 1
        val currParSize = lastOffSet - offSet
        if (index < slices) {
          if (!barrierLayers.contains(index)) {
            barrierLayers.put(index, offSet)
            val weightsPar = globalWeights.narrow(1, offSet + 1, currParSize)
            val gradsPar = globalGrads.narrow(1, offSet + 1, currParSize)
            parameterSynchronizer.init(currModule.getName, currParSize,
              executorOrders.length - i, weightsPar, gradsPar)
            currModule.setParameterSynchronizer(parameterSynchronizer)
            lastOffSet = offSet
          }
        }
      }
      i -= 1
    }
  }

  private def setModelId[T: ClassTag](model: Module[T], partitionId: Int): Unit = {
    model.setId(partitionId)

    if (model.isInstanceOf[Container[_, _, T]]) {
      model.asInstanceOf[Container[_, _, T]].modules.
        foreach(sub => setModelId(sub, partitionId))
    }
  }

  /**
   * Fetch current model parameters to driver, and copy to trainingModel.
   *
   * @param models cached models
   * @param trainingModel the model is trained by optimizer
   * @return trained model
   */
  override protected def getModel[T: ClassTag](
    models: RDD[Cache[T]],
    parameters: AllReduceParameter[T],
    trainingModel: Module[T])(implicit ev: TensorNumeric[T]): Module[T] = {
    val partitionNum = models.partitions.length
    val extraState = models.map(_.localModels.head.getExtraParameter()).first()
    trainingModel.setExtraParameter(extraState)
    // make sure gradient is as the same length as weight
    val parameterArray = trainingModel.parameters()
    (0 until parameterArray._2.length).foreach(i =>
      parameterArray._2(i).resizeAs(parameterArray._1(i))
    )

    val parameter = trainingModel.getParameters()._1
    val _classTag = classTag[T]
    val size = parameter.storage().array().length
    val taskSize = size / partitionNum
    val extraSize = size % partitionNum
    val weights = models.mapPartitions(iter => {
      val localCache = iter.next
      val localModels = localCache.localModels
      val localWeights = localModels.head.getParameters()._1
      val synchronizer = localCache.parameterSynchronizer
        .asInstanceOf[BlockManagerParameterSynchronizer[T]]
      val partitionId = synchronizer.partitionID
      val start = partitionId * taskSize + math.min(partitionId, extraSize)
      val length = taskSize + (if (partitionId < extraSize) 1 else 0)
      val partitionWeight = Tensor[T](length)
      partitionWeight.copy(localWeights.narrow(1, start + 1, length))
      Iterator.single(Map(partitionId -> partitionWeight))
    }).reduce((a, b) => (a ++ b))

    (0 until partitionNum).map(pid => {
      val start = parameter.storageOffset + pid * taskSize + math.min(pid, extraSize)
      val length = taskSize + (if (pid < extraSize) 1 else 0)
      parameter.narrow(1, start, length).copy(weights(pid))
    })
    trainingModel
  }
}

/**
 * The optimizer run on a distributed cluster.
 *
 * @param _model train model
 * @param _dataset train dataset
 * @param _criterion loss function
 */
class ParallelOptimizer[T: ClassTag](
  _model: Module[T],
  _dataset: DistributedDataSet[MiniBatch[T]],
  _criterion: Criterion[T]
)(implicit ev: TensorNumeric[T])
  extends Optimizer[T, MiniBatch[T]](
    _model, _dataset, _criterion) {
  val metrics = new Metrics

  private var models: RDD[DistriOptimizer.CacheV1[T]] = null

  private var _priorities: mutable.Map[String, Int] = null

  def setPriorities(priorities: mutable.Map[String, Int]): Unit = {
    this._priorities = priorities
  }

  /**
   * Clean some internal states, so this or other optimizers can run optimize again
   *
   * This method will be called at the end of optimize. You need not call it if optimize succeed.
   * If the optimize fails, you may call it before next optimize.
   */
  def clearState(): Unit = {
    ParallelOptimizer.clearState(models.asInstanceOf[RDD[Cache[T]]])
  }

  private def endEpoch(): Unit = {
    ParallelOptimizer.endEpoch(optimMethods)
  }

  override def setTrainData(sampleRDD: RDD[Sample[T]],
    batchSize: Int,
    miniBatch: MiniBatch[T]): this.type = {
    this.dataset = ParallelOptimizer.setTrainData(sampleRDD, batchSize, miniBatch)
    // if current epoch is not finished, we will end the
    // current epoch and start a new epoch when optimize is called
    endEpoch()
    this
  }

  override def setTrainData(sampleRDD: RDD[Sample[T]],
    batchSize: Int,
    featurePaddingParam: PaddingParam[T] = null,
    labelPaddingParam: PaddingParam[T] = null): this.type = {
    val _featurePaddingParam = if (featurePaddingParam != null) Some(featurePaddingParam) else None
    val _labelPaddingParam = if (labelPaddingParam != null) Some(labelPaddingParam) else None
    this.dataset = ParallelOptimizer.setTrainData(sampleRDD, batchSize,
      featurePaddingParam, labelPaddingParam)
    // if current epoch is not finished, we will end the
    // current epoch and start a new epoch when optimize is called
    endEpoch()
    this
  }


  override def prepareInput(): Unit = {
    if (!dataset.toDistributed().isCached) {
      ParallelOptimizer.logger.info("caching training rdd ...")
      ParallelOptimizer.prepareInput(this.dataset, this.validationDataSet)
    }
  }

  private def expandOptimMethods(optimMethodMap: mutable.Map[String, OptimMethod[T]]): Unit = {
    if (this.model.isInstanceOf[Container[_, _, T]]) {
      expandOptimMethodsForSubModules(this.model.
        asInstanceOf[Container[_, _, T]].modules,
        optimMethodMap(this.model.getName), optimMethodMap)
    } else {
      require(optimMethodMap.contains(this._model.getName),
        "single layer model should have optim method set")
    }

    if (optimMethodMap.contains(this.model.getName)) {
      this.model.setOptimMethod(optimMethodMap.get(this.model.getName).get)
    }
  }

  private def expandOptimMethodsForSubModules(subModules: ArrayBuffer[Module[T]],
    parentMethod: OptimMethod[T],
    optimMethodMap: mutable.Map[String, OptimMethod[T]]): Unit = {
    subModules.foreach(sub => {
      if (optimMethodMap.get(sub.getName) == None) {
        require(parentMethod != null, s"${sub.getName}'s parent optim method should not be null")
        val subOptimMethod = parentMethod.clone
        sub.setOptimMethod(subOptimMethod)
        optimMethodMap(sub.getName) = subOptimMethod
      }
      if (sub.isInstanceOf[Container[_, _, T]]) {
        val currMethod = optimMethodMap(sub.getName)
        expandOptimMethodsForSubModules(sub.asInstanceOf[Container[_, _, T]].modules,
          currMethod, optimMethodMap)
      }
    })
  }

  private def defaultPrioritize(): mutable.HashMap[String, Int] = {
    val priorities = new mutable.HashMap[String, Int]
    val orders = ParallelOptimizer.getExecutionOrder(this._model)
    val len = orders.size
    orders.zipWithIndex.foreach(order => {
      priorities.put(order._1.getName, len - order._2)
    })
    priorities
  }

  override def optimize(): Module[T] = {

    val distDataset = dataset.toDistributed()

    optimMethods.values.foreach { optimMethod =>
      optimMethod.clearHistory()
    }

    // To be compatible with the old usage that user define hyperparameters in a table.
    if (optimMethods.size == 1) {
      optimMethods.head._2.loadFromTable(state)
    }
    val parallelOptimMethods = scala.collection.mutable.Map(optimMethods.toSeq: _*)
    // expand optim method so that each layer has its own optim method
    expandOptimMethods(parallelOptimMethods)

    if (_priorities == null) {
      _priorities = defaultPrioritize
    }

    optimMethods = collection.immutable.Map(parallelOptimMethods.toSeq: _*)

    state("dropPercentage") = dropPercentage
    state("warmupIterationNum") = warmupIterationNum
    state("computeThresholdbatchSize") = computeThresholdbatchSize
    state("maxDropPercentage") = maxDropPercentage
    state("isLayerwiseScaled") = Utils.isLayerwiseScaled(_model)

    val nodeNumber = Engine.nodeNumber()
    val coresPerNode = Engine.coreNumber()

    val partitionNum = distDataset.originRDD().partitions.length

    prepareInput()

    models = ParallelOptimizer.initThreadModels(model, distDataset, criterion, state,
      nodeNumber, coresPerNode, checkSingleton, validationMethods,
      optimMethods, _priorities)

    if (checkpointPath.isDefined) {
      val file = checkpointPath.get + "/" +
        new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime())
      new File(file).mkdir()
      checkpointPath = Some(file)
    }

    var retryNum = 0
    val maxRetry = System.getProperty("bigdl.failure.retryTimes", "5").toInt
    val retryTimeInterval = System.getProperty("bigdl.failure.retryTimeInterval", "120").toInt
    var lastFailureTimestamp = System.nanoTime()

    ParallelOptimizer.optimize(
      model,
      distDataset,
      coresPerNode,
      state,
      endWhen,
      metrics,
      models.asInstanceOf[RDD[Cache[T]]],
      optimMethods,
      validationTrigger,
      validationDataSet,
      validationMethods,
      checkpointTrigger,
      checkpointPath,
      trainSummary,
      validationSummary,
      isOverWrite
    )

    ParallelOptimizer.getModel(models.asInstanceOf[RDD[Cache[T]]], null, model)

    // Reset some internal states, so this or other optimizers can run optimize again
    clearState()

    // release distributed synchronizer resources

    models.foreach(modelIter => {
      modelIter.parameterSynchronizer.clear
    })

    // unpersist the model because the next time optimize is called, new `models` will be
    // created
    models.unpersist()

    model
  }

  private def getLatestFile(path: String, fileName: String): String = {
    val fl = new java.io.File(path)
    val files = fl.listFiles(new FilenameFilter {
      override def accept(dir: File, name: String): Boolean = {
        name.startsWith(fileName)
      }
    })

    var lastMod = Long.MinValue
    var choice: String = null
    files.map { file =>
      if (file.lastModified() > lastMod) {
        choice = file.getPath;
        lastMod = file.lastModified();
      }
    }
    return choice;
  }
}