/*
* Copyright 2016 The BigDL Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intel.analytics.bigdl.dllib.nn.abstractnn
import java.nio.ByteOrder
import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.dllib.feature.dataset._
import com.intel.analytics.bigdl.dllib.nn.Graph.ModuleNode
import com.intel.analytics.bigdl.dllib.nn.quantized.Quantization
import com.intel.analytics.bigdl.dllib.nn.{Module, _}
import com.intel.analytics.bigdl.dllib.optim._
import com.intel.analytics.bigdl.dllib.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.dllib.tensor.{QuantizedTensor, Tensor, TensorDataType}
import com.intel.analytics.bigdl.dllib.feature.transform.vision.image.{DistributedImageFrame, ImageFeature, ImageFrame, LocalImageFrame}
import com.intel.analytics.bigdl.dllib.utils.TorchObject.TYPE_MODULE
import com.intel.analytics.bigdl.dllib.utils._
import com.intel.analytics.bigdl.dllib.utils.caffe.CaffePersister
import com.intel.analytics.bigdl.dllib.utils.intermediate.ConversionUtils
import com.intel.analytics.bigdl.dllib.utils.serializer._
import com.intel.analytics.bigdl.dllib.utils.tf.{TensorflowDataFormat, TensorflowSaver}
import org.apache.commons.lang3.SerializationUtils
import org.apache.spark.rdd.RDD
import scala.collection.mutable
import scala.reflect.ClassTag
/**
* [[TensorModule]] is an abstract subclass of [[AbstractModule]], whose
* input and output types are both [[Tensor]].
*
* @tparam T The numeric type of this module's parameters
*/
abstract class TensorModule[T: ClassTag]
(implicit ev: TensorNumeric[T]) extends AbstractModule[Tensor[T], Tensor[T], T]
/**
* Module is the basic component of a neural network. It forwards activations and
* backpropagates gradients. Modules can be connected to others to construct a complex
* neural network.
*
* @tparam A Input data type
* @tparam B Output data type
* @tparam T The numeric type of this module's parameters.
*/
abstract class AbstractModule[A <: Activity: ClassTag, B <: Activity: ClassTag, T: ClassTag](
implicit ev: TensorNumeric[T]) extends Serializable with InferShape {
// ================================= Public APIs =============================================
/**
* The cached output, so we don't compute it again when it is needed
*/
var output: B = Activity.allocate[B, T]()
/**
* The cached gradient w.r.t. the input activities, so we don't compute it again when it is needed
*/
var gradInput: A = Activity.allocate[A, T]()
protected var inputsFormats: Seq[Int] = null
protected var outputsFormats: Seq[Int] = null
/**
* Set input formats for graph
* @param formats a sequence of format constants, one per graph input
* @return this
*/
def setInputFormats(formats: Seq[Int]): this.type = {
inputsFormats = formats
this
}
/**
* Set output formats for graph
* @param formats a sequence of format constants, one per graph output
* @return this
*/
def setOutputFormats(formats: Seq[Int]): this.type = {
outputsFormats = formats
this
}
/**
* Get the scale of gradientWeight
*/
final def getScaleW(): Double = {
scaleW
}
/**
* Get the scale of gradientBias
*/
final def getScaleB(): Double = {
scaleB
}
/**
* Set the scale of gradientWeight
*
* @param w the value of the scale of gradientWeight
* @return this
*/
def setScaleW(w: Double): this.type = {
scaleW = w
this
}
/**
* Set the scale of gradientBias
*
* @param b the value of the scale of gradientBias
* @return this
*/
def setScaleB(b: Double): this.type = {
scaleB = b
this
}
/**
* Clear cached activities to save storage space or network bandwidth. Note that we use
* Tensor.set to keep some information, such as tensor sharing.
*
* The subclass should override this method if it allocates extra resources, and call
* super.clearState in the overriding method.
*
* @return this
*/
def clearState() : this.type = {
if (output.isInstanceOf[Tensor[_]]) {
output.asInstanceOf[Tensor[_]].set()
}
if (gradInput.isInstanceOf[Tensor[_]]) {
gradInput.asInstanceOf[Tensor[_]].set()
}
this
}
/**
* Whether the user has set a name for this module
* @return
*/
final def hasName: Boolean = name != null
/**
* Set the module name
*
* @param name the module name
* @return this
*/
final def setName(name : String) : this.type = {
this.name = name
this
}
/**
* Get the module name; the default name is the class name followed by namePostfix
*
* @return
*/
final def getName() : String = {
if (this.name == null) {
s"${this.getClass.getSimpleName}${namePostfix}"
} else {
this.name
}
}
override def toString(): String = getPrintName
/**
* Get the forward/backward cost time for the module or its submodules
* @return
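*
* A minimal usage sketch (assumes `model` is any module and `input` matches its
* expected shape):
* {{{
*   model.resetTimes()
*   model.forward(input)
*   model.getTimes().foreach { case (module, fwd, bwd) =>
*     println(s"${module.getName()}: forward ${fwd}ns, backward ${bwd}ns")
*   }
* }}}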
*/
def getTimes(): Array[(AbstractModule[_ <: Activity, _ <: Activity, T], Long, Long)] = {
Array((this, forwardTime, backwardTime))
}
/**
* Get the forward/backward cost time for the module or its submodules
* and group by module type.
* @return (module type name, forward time, backward time)
*/
final def getTimesGroupByModuleType():
Array[(String, Long, Long)] = {
this.getTimes().map(v => (v._1.getClass().getName(), v._2, v._3)).groupBy(_._1)
.map(v => (v._1, v._2.reduce((a, b) => (v._1, a._2 + b._2, a._3 + b._3))))
.map(v => (v._1, v._2._2, v._2._3))
.toArray
.sortWith((a, b) => (a._2 + a._3) > (b._2 + b._3))
}
/**
* Reset the recorded forward/backward time for the module and its submodules
* @return
*/
def resetTimes(): Unit = {
forwardTime = 0
backwardTime = 0
}
/**
* Freeze the module,
* i.e. its parameters (weight/bias, if they exist) are not updated during training.
* If names is not empty,
* only the layers that match the given `names` are frozen.
*
* @param names an array of layer names
* @return current graph model
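*
* A minimal sketch (the layer name "fc1" is illustrative):
* {{{
*   val model = Sequential[Float]().add(Linear[Float](10, 5).setName("fc1"))
*   model.freeze("fc1")    // fc1's weight/bias stay fixed during training
*   model.unFreeze("fc1")  // make fc1 trainable again
* }}}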
*/
def freeze(names: String*): this.type = {
if (names.isEmpty) {
// in case when freeze is called many times
if (scaleW != 0) {
scaleWCache = scaleW
scaleW = 0
}
if (scaleB != 0) {
scaleBCache = scaleB
scaleB = 0
}
} else {
names.foreach(name => {
this(name) match {
case Some(x) => x.freeze()
case _ =>
Log4Error.invalidOperationError(false, s"cannot match module named $name")
}
})
}
this
}
/**
* "unfreeze" module, i.e. make the module parameters(weight/bias, if exists)
* to be trained(updated) in training process
* if names is not empty, unfreeze layers that match given names
*
* @param names array of module names to unFreeze
*/
def unFreeze(names: String*): this.type = {
if (names.isEmpty) {
scaleW = scaleWCache
scaleB = scaleBCache
} else {
names.foreach(name => {
this(name) match {
case Some(x) => x.unFreeze()
case _ =>
Log4Error.invalidOperationError(false, s"cannot match module named $name")
}
})
}
this
}
/**
* Takes an input object, and computes the corresponding output of the module. After a forward,
* the output state variable should have been updated to the new value.
*
* @param input input data
* @return output data
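*
* A minimal sketch (the Linear layer and shapes are illustrative):
* {{{
*   val layer = Linear[Float](10, 5)
*   val input = Tensor[Float](2, 10).rand()
*   val output = layer.forward(input)  // a 2x5 tensor, also cached in layer.output
* }}}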
*/
final def forward(input: A): B = {
val before = System.nanoTime()
try {
updateParameter()
updateOutput(input)
} catch {
case l: IllegalArgumentException =>
Log4Error.invalidOperationError(false, l.getMessage, cause = l)
null
case u: InvalidOperationException =>
Log4Error.invalidOperationError(false, u.getMessage, cause = u)
null
case e: Throwable =>
val errormsg = this.toString() + "\n" + e.getMessage
Log4Error.unKnowExceptionError(false, errormsg, cause = e)
null
}
forwardTime += System.nanoTime() - before
output
}
/**
* Performs a back-propagation step through the module, with respect to the given input. In
* general this method makes the assumption forward(input) has been called before, with the same
* input. This is necessary for optimization reasons. If you do not respect this rule, backward()
* will compute incorrect gradients.
*
* @param input input data
* @param gradOutput gradient of next layer
* @return gradient corresponding to input data
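*
* A minimal sketch (layer and shapes are illustrative; note that forward must run first):
* {{{
*   val layer = Linear[Float](10, 5)
*   val input = Tensor[Float](2, 10).rand()
*   layer.forward(input)
*   val gradOutput = Tensor[Float](2, 5).rand()        // gradient from the next layer
*   val gradInput = layer.backward(input, gradOutput)  // 2x10 gradient w.r.t. input
* }}}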
*/
def backward(input: A, gradOutput: B): A = {
val before = System.nanoTime()
updateGradInput(input, gradOutput)
accGradParameters(input, gradOutput)
backwardTime += System.nanoTime() - before
asyncGradient()
gradInput
}
private[bigdl] def asyncGradient(): Unit = {
if (this.getParameterSynchronizer() != null) {
if (this.parameters() != null) {
this.getParameterSynchronizer.put(this.getName)
}
}
}
/**
* Computes the output using the current parameter set of the class and input. This function
* returns the result which is stored in the output field.
*
* @param input input data
* @return output data, also stored in the output field
*/
def updateOutput(input: A): B
/**
* Computing the gradient of the module with respect to its own input. This is returned in
* gradInput. Also, the gradInput state variable is updated accordingly.
*
* @param input input data
* @param gradOutput gradient of the next layer
* @return gradient w.r.t. the input, also stored in the gradInput field
*/
def updateGradInput(input: A, gradOutput: B): A
/**
* Computing the gradient of the module with respect to its own parameters. Many modules do not
* perform this step as they do not have any parameters. The state variable name for the
* parameters is module dependent. The module is expected to accumulate the gradients with
* respect to the parameters in some variable.
*
* @param input input data
* @param gradOutput gradient of the next layer
*/
def accGradParameters(input: A, gradOutput: B): Unit = {}
/**
* If the module has parameters, this will zero the accumulation of the gradients with respect
* to these parameters. Otherwise, it does nothing.
*/
def zeroGradParameters(): Unit = {
if (parameters() != null) {
parameters()._1.zip(parameters()._2).foreach { case (weight, grad) =>
grad.resizeAs(weight).zero()
}
}
}
/**
* This function returns two arrays, one for the weights and the other for the gradients.
* Custom modules should override this function if they have parameters
*
* @return (Array of weights, Array of gradients)
*/
def parameters(): (Array[Tensor[T]], Array[Tensor[T]]) = null
/**
* Get extra parameter in this module.
* Extra parameters are the trainable parameters besides weight and bias, such as
* runningMean and runningVar in BatchNormalization.
*
* The subclass should override this method if it has some parameters besides weight and bias.
*
* @return an array of tensor
*/
def getExtraParameter(): Array[Tensor[T]] = null
/**
* Set extra parameter to this module.
* Extra parameters are the trainable parameters besides weight and bias, such as
* runningMean and runningVar in BatchNormalization.
*
* @return this
*/
def setExtraParameter(extraParam: Array[Tensor[T]]): this.type = {
val currentExtraParam = this.getExtraParameter()
if (extraParam != null && currentExtraParam != null) {
Log4Error.invalidInputError(extraParam.length == currentExtraParam.length,
"state's length doesn't match, excepted:" +
s"${currentExtraParam.length}, but got ${extraParam.length}")
var i = 0
while (i < extraParam.length) {
currentExtraParam(i).copy(extraParam(i))
i += 1
}
this
} else if (extraParam == null && currentExtraParam == null) {
this
} else {
Log4Error.invalidOperationError(false, s"module's extraParameter is $currentExtraParam" +
s", while setting param is ${extraParam}")
this
}
}
/**
* This function returns a table contains ModuleName, the parameter names and parameter value
* in this module.
*
* The result table is a structure of Table(ModuleName -> Table(ParameterName -> ParameterValue)),
* and the type is Table[String, Table[String, Tensor[T]]].
*
* For example, get the weight of a module named conv1:
* table[Table]("conv1")[Tensor[T]]("weight").
*
* The names of the parameters follow such convention:
*
* 1. If there's one parameter, the parameter is named as "weight", the gradient is named as
* "gradWeight"
*
* 2. If there're two parameters, the first parameter is named as "weight", the first gradient is
* named as "gradWeight"; the second parameter is named as "bias", the seconcd gradient is
* named as "gradBias"
*
* 3. If there're more parameters, the weight is named as "weight" with a seq number as suffix,
* the gradient is named as "gradient" with a seq number as suffix
*
* Custom modules should override the default implementation of this function if the
* convention doesn't meet the requirement.
*
* @return Table
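*
* A minimal sketch of the convention above (the convolution layer is illustrative):
* {{{
*   val conv = SpatialConvolution[Float](3, 16, 3, 3).setName("conv1")
*   val params = conv.getParametersTable()
*   val weight = params[Table]("conv1")[Tensor[Float]]("weight")
* }}}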
*/
def getParametersTable(): Table = {
val params = parameters()
if (params == null) return null
val (weights, gradients) = params
Log4Error.invalidInputError(gradients.length == weights.length,
"weight number is not equal to grad number")
if (weights.length == 1) {
T(getName() -> T("weight" -> weights(0), "gradWeight" -> gradients(0)))
} else if (weights.length == 2) {
T(getName() -> T("weight" -> weights(0), "bias" -> weights(1),
"gradWeight" -> gradients(0), "gradBias" -> gradients(1)))
} else {
val result = T()
weights.zip(gradients).zipWithIndex.foreach { case ((w, g), i) =>
result(s"weight$i") = w
result(s"gradient$i") = g
}
T(getName() -> result)
}
}
/**
* Set the module to training mode
* @return
*/
def training(): this.type = {
train = true
this
}
/**
* Set the module to evaluate mode
* @return
*/
def evaluate(): this.type = {
train = false
this
}
/**
* Check if the model is in training mode
* @return
*/
final def isTraining(): Boolean = {
this.train
}
/**
* Check if the model is a PyTorch model
* @return
*/
def isPyTorch(): Boolean = {
false
}
/**
* Check if the model is a TensorFlow model
* @return
*/
def isTensorFlow(): Boolean = {
false
}
/**
* Reset the module parameters, i.e. re-initialize them with the given initMethod
*/
def reset(): Unit = {}
/**
* Set the line separator used when printing the module
* @param line the separator string
* @return this
*/
final def setLine(line: String): this.type = {
this.line = line
this
}
/**
* Clone the model
* @return
*/
final def cloneModule(): this.type = {
SerializationUtils.clone(this)
}
/**
* Clone the module, deep or shallow copy
* @param deepCopy whether to perform a deep copy of the weights
* @return the cloned module
*/
final def clone(deepCopy : Boolean): AbstractModule[A, B, T] = {
val moduleData = ModuleData[T](this.
asInstanceOf[AbstractModule[Activity, Activity, T]], Seq[String](), Seq[String]())
val storages = new mutable.HashMap[Int, Any]()
val context = SerializeContext(moduleData, storages, ProtoStorageType, false)
val serializedModule = ModuleSerializer.serialize[T](context).bigDLModule
ModulePersister.setTensorStorage(serializedModule, storages)
storages.clear()
val deserializeContext = DeserializeContext(serializedModule.build,
storages, ProtoStorageType, false)
ModuleLoader.initTensorStorage[T](deserializeContext)
val copy = ModuleSerializer.load[T](deserializeContext).module
.asInstanceOf[AbstractModule[A, B, T]]
setWeightAndBias(copy, deepCopy)
copy
}
override def equals(other: Any): Boolean = other match {
case that: AbstractModule[A, B, T] =>
(that canEqual this) &&
(that.getClass equals this.getClass) &&
output == that.output &&
gradInput == that.gradInput &&
name == that.name
case _ => false
}
override def hashCode(): Int = {
def getHashCode(a: Object): Int = if (a == null) 0 else a.hashCode()
val state = Seq(output, gradInput, this.getClass, this.name)
state.map(getHashCode).foldLeft(0)((a, b) => 31 * a + b)
}
/**
* Save this module to path.
* @param path path to save module; local file system, HDFS and Amazon S3 are supported.
* HDFS path should be like "hdfs://[host]:[port]/xxx"
* Amazon S3 path should be like "s3a://bucket/xxx"
* @param overWrite if overwrite
* @return self
*/
@deprecated("please use recommended saveModule(path, overWrite)", "0.3.0")
private[bigdl] def save(path : String, overWrite: Boolean = false) : this.type = {
this.clearState()
File.save(this, path, overWrite)
this
}
/**
* Save this module to path with protobuf format
* @param path path to save module; local file system, HDFS and Amazon S3 are supported.
* HDFS path should be like "hdfs://[host]:[port]/xxx"
* Amazon S3 path should be like "s3a://bucket/xxx"
* @param weightPath where to store weight
* @param overWrite if overwrite
* @return self
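*
* A minimal sketch (the paths are placeholders; loading back uses the companion
* object's loadModule):
* {{{
*   model.saveModule("/tmp/model.bigdl", "/tmp/model.bin", overWrite = true)
*   val loaded = Module.loadModule[Float]("/tmp/model.bigdl", "/tmp/model.bin")
* }}}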
*/
final def saveModule(path : String, weightPath : String = null,
overWrite: Boolean = false) : this.type = {
this.clearState()
ModulePersister.saveToFile(path, weightPath, this, overWrite)
this
}
final def saveModel(path : String, weightPath : String = null,
overWrite: Boolean = false) : this.type = {
this.saveModule(path, weightPath, overWrite)
this
}
/**
* Save this module definition to path.
* @param path path to save module; local file system, HDFS and Amazon S3 are supported.
* HDFS path should be like "hdfs://[host]:[port]/xxx"
* Amazon S3 path should be like "s3a://bucket/xxx"
* @param overWrite if overwrite
* @return self
*/
final def saveDefinition(path : String, overWrite: Boolean = false) : this.type = {
this.clearState()
ModulePersister.saveModelDefinitionToFile(path, this, overWrite)
this
}
/**
* Save this module to path in torch7 readable format
* @param path path to save module
* @param overWrite if overwrite
* @return self
*/
final def saveTorch(path : String, overWrite: Boolean = false) : this.type = {
this.clearState()
File.saveTorch(this, path, TYPE_MODULE, overWrite)
this
}
/**
* Save this module to path in caffe readable format
* @param prototxtPath path to save the caffe prototxt definition
* @param modelPath path to save the caffe model weights
* @param useV2 whether to use the caffe V2 format
* @param overwrite if overwrite
* @return self
*/
final def saveCaffe(prototxtPath: String, modelPath: String,
useV2 : Boolean = true, overwrite : Boolean = false) : this.type = {
this.clearState()
CaffePersister.persist[T](prototxtPath, modelPath, this, useV2, overwrite)
this
}
/**
* Save this module to path in tensorflow readable format
* @param inputs input names and shapes of the graph
* @param path path to save module
* @param byteOrder byte order of the saved data
* @param dataFormat tensorflow data format, e.g. NHWC
* @return self
*/
final def saveTF(
inputs : Seq[(String, Seq[Int])],
path: String,
byteOrder: ByteOrder = ByteOrder.LITTLE_ENDIAN,
dataFormat: TensorflowDataFormat = TensorflowDataFormat.NHWC): this.type = {
Log4Error.invalidInputError(this.isInstanceOf[Graph[T]],
"only Graph container can be saved as Tensorflow model")
this.clearState()
val inTrainMode = train
if (inTrainMode) {
this.evaluate()
}
TensorflowSaver.saveGraph(this.asInstanceOf[Graph[T]], inputs, path, byteOrder, dataFormat)
if (inTrainMode) {
this.training()
}
this
}
/**
* Operations to perform before getting the model
* @return
*/
def beforeGetModel(): Unit = {}
/**
* Get numeric type of module parameters
* @return
*/
final def getNumericType(): TensorDataType = {
ev.getType()
}
/**
* module predict, return the probability distribution
* @param dataset dataset for prediction
* @param batchSize total batchSize for all partitions.
* if -1, default is 4 * partitionNumber of dataset
* @param shareBuffer whether to share same memory for each batch predict results
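*
* A minimal sketch (assumes an existing `samples: RDD[Sample[Float]]` and a
* trained `model`):
* {{{
*   val predictions = model.predict(samples, batchSize = 128)
*   predictions.take(1).foreach(println)
* }}}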
*/
final def predict(dataset: RDD[Sample[T]],
batchSize: Int = -1,
shareBuffer: Boolean = false): RDD[Activity] = {
Predictor(this).predict(dataset, batchSize, shareBuffer)
}
/**
* module predict, return the predicted label
* @param dataset dataset for prediction
* @param batchSize total batchSize for all partitions.
* if -1, default is 4 * partitionNumber of dataset
*/
final def predictClass(dataset: RDD[Sample[T]], batchSize: Int = -1): RDD[Int] = {
Predictor(this).predictClass(dataset, batchSize)
}
/**
* Model prediction on images: returns an ImageFrame that contains the predicted tensors.
* If you want to call predictImage multiple times,
* it is recommended to use Predictor for a DistributedImageFrame
* or LocalPredictor for a LocalImageFrame
* @param imageFrame imageFrame that contains images
* @param outputLayer if outputLayer is not null, the output of layer that matches
* outputLayer will be used as predicted output
* @param shareBuffer whether to share same memory for each batch predict results
* @param batchPerPartition batch size per partition, default is 4
* @param predictKey key to store predicted result
* @param featurePaddingParam featurePaddingParam if the inputs have variant size
* @return
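*
* A minimal sketch (the path is a placeholder and `sc` is an existing SparkContext;
* predictions are stored in each ImageFeature under `predictKey`):
* {{{
*   val images = ImageFrame.read("/tmp/images", sc)
*   val result = model.predictImage(images)
* }}}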
*/
final def predictImage(imageFrame: ImageFrame,
outputLayer: String = null,
shareBuffer: Boolean = false,
batchPerPartition: Int = 4,
predictKey: String = ImageFeature.predict,
featurePaddingParam: Option[PaddingParam[T]] = None): ImageFrame = {
imageFrame match {
case distributedImageFrame: DistributedImageFrame =>
Predictor(this, featurePaddingParam, batchPerPartition)
.predictImage(distributedImageFrame, outputLayer, shareBuffer, predictKey)
case localImageFrame: LocalImageFrame =>
val predictor = LocalPredictor(this, featurePaddingParam, batchPerPartition)
val result = predictor.predictImage(localImageFrame, outputLayer, shareBuffer,
predictKey)
predictor.shutdown()
result
}
}
/**
* Set weight and bias for the module
* @param newWeights array of weights and bias
* @return
*/
final def setWeightsBias(newWeights: Array[Tensor[T]]): this.type = {
Log4Error.invalidInputError(parameters() != null, "this layer does not have weight/bias")
Log4Error.invalidInputError(parameters()._1.length == newWeights.length,
"the number of input weight/bias is not consistant with " +
"number of weight/bias of this layer, " +
s"number of input ${parameters()._1.length}," +
s" number of output ${newWeights.length}")
val weights = parameters()._1
for(i <- newWeights.indices) {
// TODO: enable this checking as we don't respect shape right now.
// Log4Error.invalidInputError(weights(i).size().deep == newWeights(i).size().deep,
// s"Mismatch shape, ${weights(i).size().mkString(",")}" +
// s" vs ${newWeights(i).size().mkString(",")} ")
weights(i).copy(newWeights(i))
}
this
}
/**
* Get weight and bias for the module
* @return array of weights and bias
*
*/
final def getWeightsBias(): Array[Tensor[T]] = {
if (parameters() != null) {
parameters()._1
} else {
null
}
}
/**
* save weights and bias to file
* @param path file to save
* @param overWrite whether to overwrite or not
*/
final def saveWeights(path: String, overWrite: Boolean): Unit = {
val parameterTable = getParametersTable()
val weightsBiasTable = T()
parameterTable.foreach {
case (name: String, params: Table) =>
val wb = T()
if (params.contains("weight")) {
wb("weight") = params("weight")
}
if (params.contains("bias")) {
wb("bias") = params("bias")
}
weightsBiasTable(name) = wb
case _ =>
Log4Error.invalidOperationError(false, "invalid parameter table")
}
weightsBiasTable.save(path, overWrite)
}
/**
* load pretrained weights and bias to current module
* @param weightPath file to store weights and bias
* @param matchAll whether to match all layers' weights and bias,
* if not, only load existing pretrained weights and bias
* @return current module
*/
final def loadWeights(weightPath: String, matchAll: Boolean = true): this.type = {
val srcParameter = File.load[Table](weightPath)
val targetParameter = getParametersTable()
copyWeights(targetParameter, srcParameter, matchAll)
this
}
/**
* copy weights from another model, mapping by layer name
* @param srcModel model to copy from
* @param matchAll whether to match all layers' weights and bias,
* @return current module
*/
final def loadModelWeights(srcModel: Module[Float], matchAll: Boolean = true): this.type = {
val srcParameter = srcModel.getParametersTable()
val targetParameter = getParametersTable()
copyWeights(targetParameter, srcParameter, matchAll)
this
}
protected def processInputs(nodes: Seq[ModuleNode[T]]): ModuleNode[T] = {
val curNode = new ModuleNode[T](this)
nodes.foreach(node => {
node.add(curNode, Edge())
})
curNode
}
protected def processInputs(first: (ModuleNode[T], Int),
nodesWithIndex : (ModuleNode[T], Int)*): ModuleNode[T] = {
val curNode = new ModuleNode[T](this)
first._1.add(curNode, Edge(first._2))
nodesWithIndex.foreach(nodeWithIndex => {
nodeWithIndex._1.add(curNode, Edge(nodeWithIndex._2))
})
curNode
}
/**
* Build graph: some other modules point to current module
* @param nodes upstream module nodes
* @return node containing current module
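*
* A minimal sketch of building a graph with this functional API (the layer
* choices are illustrative):
* {{{
*   val input = Input[Float]()
*   val fc1 = Linear[Float](10, 5).inputs(input)
*   val fc2 = Linear[Float](5, 2).inputs(fc1)
*   val model = Graph[Float](input, fc2)
* }}}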
*/
def inputs(nodes : ModuleNode[T]*): ModuleNode[T] = {
validateInput(nodes.map(_.element))
processInputs(nodes)
}
/**
* Build graph: some other modules point to current module
* @param nodes upstream module nodes in an array
* @return node containing current module
*/
def inputs(nodes : Array[ModuleNode[T]]): ModuleNode[T] = {
validateInput(nodes.map(_.element))
processInputs(nodes)
}
/**
* Build graph: some other modules point to current module
* @param first distinguishes this overload from the other inputs methods when the
*              variable-length parameter list would otherwise be empty
* @param nodesWithIndex upstream module nodes and the output tensor index. The start index is 1.
* @return node containing current module
*/
def inputs(first: (ModuleNode[T], Int), nodesWithIndex : (ModuleNode[T], Int)*): ModuleNode[T] = {
validateInput(List(first._1.element))
validateInput(nodesWithIndex.map(_._1.element))
processInputs(first, nodesWithIndex: _*)
}
/**
* Generate a graph module with the given start nodes
* @param startNodes the start nodes; an Input node is created if empty
* @return the generated graph
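*
* A minimal sketch (converting a Sequential container into a Graph):
* {{{
*   val seq = Sequential[Float]().add(Linear[Float](10, 5)).add(ReLU[Float]())
*   val graph = seq.toGraph()
* }}}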
*/
def toGraph(startNodes: ModuleNode[T]*): Graph[T] = {
val starts = if (startNodes.isEmpty) Array(Input[T]()) else startNodes.toArray
val endNodes = this.getEndNodes(starts)
var graph = Graph(starts, endNodes)
if (graph.isInstanceOf[StaticGraph[T]]) {
// Merge nested graphs inside to make the whole graph non-nested
graph = graph.asInstanceOf[StaticGraph[T]].toSingleGraph()
}
if (inputsFormats != null) {
graph.setInputFormats(inputsFormats)
}
if (outputsFormats != null) {
graph.setOutputFormats(outputsFormats)
}
graph
}
/**
* Find a module with the given name. If there is no module with the given name, it will
* return None. If there are multiple modules with the given name, an exception will be thrown.
* @param name the module name to look up
* @return the module wrapped in Some, or None if not found
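*
* A minimal sketch (the name "fc1" is illustrative):
* {{{
*   model("fc1") match {
*     case Some(layer) => layer.setScaleW(0.5)
*     case None => // no module named "fc1" in this model
*   }
* }}}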
*/
def apply(name : String): Option[AbstractModule[Activity, Activity, T]] = {
if (this.getName() == name) {
Some(this)
} else {
None
}
}
/**
* use ValidationMethod to evaluate module on the given rdd dataset
* @param dataset dataset for test
* @param vMethods validation methods
* @param batchSize total batch size of all partitions,
* optional param and default 4 * partitionNum of dataset
* @return
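*
* A minimal sketch (assumes an existing `testSet: RDD[Sample[Float]]`; Top1Accuracy
* is used for illustration):
* {{{
*   val results = model.evaluate(testSet, Array(new Top1Accuracy[Float]()))
*   results.foreach { case (result, method) => println(s"$method: $result") }
* }}}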
*/
final def evaluate(
dataset: RDD[Sample[T]],
vMethods: Array[_ <: ValidationMethod[T]],
batchSize: Option[Int] = None
): Array[(ValidationResult, ValidationMethod[T])] = {
Evaluator(this).test(dataset, vMethods.map(v => v), batchSize)
}
/**
* use ValidationMethod to evaluate module on the given rdd dataset
* @param dataset dataset for test
* @param vMethods validation methods
* @return
*/
final def evaluate(
dataset: RDD[MiniBatch[T]],
vMethods: Array[_ <: ValidationMethod[T]]
): Array[(ValidationResult, ValidationMethod[T])] = {
Evaluator(this).testMiniBatch(dataset, vMethods.map(v => v))
}
/**
* use ValidationMethod to evaluate module on the given ImageFrame
* @param imageFrame ImageFrame for validation
* @param vMethods validation methods
* @param batchSize total batch size of all partitions
* @return
*/
final def evaluateImage(imageFrame: ImageFrame,
vMethods: Array[_ <: ValidationMethod[T]],
batchSize: Option[Int] = None
): Array[(ValidationResult, ValidationMethod[T])] = {
Log4Error.invalidInputError(imageFrame.isDistributed(), "ImageFrame must be distributed")
val rdd = imageFrame.toDistributed().rdd.map(imageFeature => {
if (imageFeature.isValid) {
Log4Error.invalidInputError(imageFeature.contains(ImageFeature.sample),
"ImageFeature must have sample")
imageFeature[Sample[T]](ImageFeature.sample)
} else {
null
}
}).filter(_ != null)
evaluate(rdd, vMethods, batchSize)
}
/**
* use ValidationMethod to evaluate module on the given local dataset
* @param dataSet local dataset for test
* @param vMethods validation methods
* @return
*/
final def evaluate(
dataSet: LocalDataSet[MiniBatch[T]],
vMethods: Array[_ <: ValidationMethod[T]]
): Array[(ValidationResult, ValidationMethod[T])] = {
Validator(this, dataSet).test(vMethods.map(v => v))
}
/**
* Quantize this module, which reduces the precision of the parameters. This gives
* higher speed at a small accuracy cost.
* @return
*/
final def quantize(): Module[T] = {
ConversionUtils.convert[T](this, true)
}
// ================================= Internal APIs ===========================================
private var namePostfix = Integer.toHexString(java.util.UUID.randomUUID().hashCode())
final private[bigdl] def getNamePostfix : String = namePostfix
final private[bigdl] def setNamePostfix(namePostfix : String) : Unit =
this.namePostfix = namePostfix
/**
* The scale of gradient weight and gradient bias
* before gradParameters are accumulated.
*/
protected var scaleW: Double = 1.0
protected var scaleB: Double = 1.0
private[nn] final def allocateAs(dest: Activity): Activity = dest match {
case tensor: Tensor[T] => Tensor[T]()
case table: Table => T()
case _ =>
Log4Error.invalidOperationError(false, "Activity only support tensor and table now")
null
}
/**
* The name of the module
*/
private var name : String = null
private var id: Int = 0
private[bigdl] def setId(id: Int): Unit = {
this.id = id
}
private[bigdl] def getId(): Int = this.id
protected final def getPrintName(): String = {
val postfix = if (name == null) {
namePostfix
} else {
name
}
s"${this.getClass.getSimpleName}[${postfix}]"
}
protected var forwardTime = 0L
protected var backwardTime = 0L
private var scaleWCache: Double = scaleW
private var scaleBCache: Double = scaleB
/**
* This function returns two tensors: one for the flattened trainable parameters
* (flatParameters) and one for the gradients of the energy w.r.t. the trainable
* parameters (flatGradParameters).
*
* Custom modules should not override this function. They should instead override parameters(...)
* which is, in turn, called by the present function.
*
* This function will go over all the weights and gradWeights and make them view into a single
* tensor (one for weights and one for gradWeights).
*
* @return
*/
final private[bigdl] def getParameters(): (Tensor[T], Tensor[T]) = {
val (weightParameters, gradParameters) = this.parameters()
// may be null if there are no weights in this module.
Log4Error.invalidInputError(weightParameters != null && weightParameters.length > 0,
s"model ${this.getName()} doesn't have any trainable parameters.")
// If some gradParameters are not allocated storage, allocate it
Log4Error.invalidInputError(weightParameters.size == gradParameters.size,
"weights and gradient number are not match")
weightParameters.zip(gradParameters).foreach{ case(w, g) => g.resizeAs(w)}
(Module.flatten[T](weightParameters), Module.flatten[T](gradParameters))
}
/**
* Module status. It is useful for modules like dropout/batch normalization
*/
protected var train: Boolean = true
protected var line = "\n"
private val engineType: EngineType = Engine.getEngineType()
/**
* Check that the module's engine type matches the global execution engine type
*/
private[bigdl] def checkEngineType(): this.type = {
if (engineType != Engine.getEngineType()) {
Log4Error.invalidInputError(false,
s"Module's EngineType ${engineType} doesn't match global" +
s" EngineType ${Engine.getEngineType()}",
"Module's EngineType should be same with global EngineType")
}
this
}
final private def setWeightAndBias(copy : AbstractModule[A, B, T], deepCopy : Boolean): Unit = {
val parameterTable = this.getParametersTable
val copiedModuleParamTable = copy.getParametersTable
if (parameterTable != null) {
Log4Error.invalidInputError(copiedModuleParamTable != null,
"cloned module should have params")
parameterTable.foreach {
case (name: String, params: Table) =>
Log4Error.invalidInputError(copiedModuleParamTable.get(name) != None,
s"cloned module should have for $name")
setLayerWeightAndBias(params,
copiedModuleParamTable.get(name).get.asInstanceOf[Table], deepCopy)
case unsupported =>
Log4Error.invalidOperationError(false, s"unsupported parameter table entry $unsupported")
}
}
}
final private def setLayerWeightAndBias(params : Table,
copyParams : Table, deepCopy : Boolean): Unit = {
params.foreach(param => {
copyParam(params, copyParams, deepCopy, param._1.toString)
})
}
final private def copyParam(params : Table, copyParams : Table,
deepCopy : Boolean, paraName : String) : Unit = {
if (params.contains(paraName)) {
// this is for quantization tensors where the weight might be an array
if (params.get(paraName).get
.isInstanceOf[Array[Tensor[T]]]) {
val copies = copyParams.get(paraName).get
.asInstanceOf[Array[Tensor[T]]]
val origins = params.get(paraName).get
.asInstanceOf[Array[Tensor[T]]]
var i = 0
while (i < copies.length) {
copyTensor(origins(i), copies(i), deepCopy)
i += 1
}
} else {
// For normal layers, their params are just tensors
copyTensor(params.get(paraName).get.asInstanceOf[Tensor[T]],
copyParams.get(paraName).get.asInstanceOf[Tensor[T]], deepCopy)
}
}
}
final private def copyTensor(t1 : Tensor[T], t2 : Tensor[T], deepCopy : Boolean) = {
if (t2.isInstanceOf[QuantizedTensor[_]]) {
t2.asInstanceOf[QuantizedTensor[_]].release()
}
if (deepCopy) {
t2.copy(t1)
} else {
t2.set(t1)
}
}
final private def copyWeights(target: Table, src: Table, matchAll: Boolean): Unit = {
target.foreach {
case (name: String, targetParams: Table) =>
if (src.contains(name)) {
val srcParams = src[Table](name)
if (srcParams.contains("weight")) {
val w = srcParams[Tensor[T]]("weight")
targetParams[Tensor[T]]("weight").resizeAs(w).copy(w)
}
if (srcParams.contains("bias")) {
val b = srcParams[Tensor[T]]("bias")
targetParams[Tensor[T]]("bias").resizeAs(b).copy(b)
}
} else {
if (matchAll) {
Log4Error.invalidOperationError(false,
s"module $name cannot find corresponding weight bias")
}
}
case unsupported =>
Log4Error.invalidOperationError(false, s"unsupported parameter table entry $unsupported")
}
}
private[bigdl] def canEqual(other: Any): Boolean = other.isInstanceOf[AbstractModule[A, B, T]]
/**
* Generate end nodes of current module with start nodes
* @param startNodes current start nodes
* @return current end nodes
*/
private[bigdl] def getEndNodes(startNodes: Array[ModuleNode[T]]): Array[ModuleNode[T]] = {
val endNodes = Array(this.processInputs(startNodes))
endNodes
}
/**
* Return classTag numerics for module serialization. If your module takes multiple class
* tags in its constructor, you should override this method
* @return
*/
private[bigdl] def getClassTagNumerics() : (Array[ClassTag[_]], Array[TensorNumeric[_]]) = {
(Array(scala.reflect.classTag[T]), Array(ev))
}
/**
* Check if some module is duplicated in the model. A layer cannot be duplicated;
* containers should override this method
*/
private[bigdl] def checkDuplicate(
record: mutable.HashSet[Int] = mutable.HashSet()
): Unit = {
val errMsg = "Some module is duplicate in the current model: "
val curId = System.identityHashCode(this)
Log4Error.invalidInputError(this.skipDuplicateCheck()
|| !record.contains(curId), errMsg + this.getName())
record.add(curId)
}
/**
* Sometimes a layer needs to skip the duplicate check, e.g. a Keras-like input layer
* @return
*/
private[nn] def skipDuplicateCheck(): Boolean = false
/**
* If the model contains native resources such as aligned memory, we should release them
* manually, as the JVM GC can't release them reliably.
*/
def release(): Unit = {}
/**
* parameter synchronizer for gradient synchronization
*/
private var _parameterSynchronizer: DistriParameterSynchronizer[T] = null
/**
* set parameter synchronizer
* @param parameterSynchronizer parameter synchronizer
*/
private[bigdl] def setParameterSynchronizer(parameterSynchronizer:
DistriParameterSynchronizer[T]): Unit = {
_parameterSynchronizer = parameterSynchronizer
}
/**
* get parameter synchronizer
* @return parameter synchronizer
*/
private[bigdl] def getParameterSynchronizer():
DistriParameterSynchronizer[T] = _parameterSynchronizer
private var _optimMethod: OptimMethod[T] = null
/**
* set optim method
*/
private[bigdl] def setOptimMethod(optimMethod: OptimMethod[T]): Unit = {
_optimMethod = optimMethod
}
/**
* get optim method for layer
*/
private[bigdl] def getOptimMethod(): OptimMethod[T] = _optimMethod
private[bigdl] def updateParameter(): Unit = {
if (this.getParameterSynchronizer() != null && this.isTraining) {
if (this.parameters() != null) {
val (weights, grads) = this.getParameterSynchronizer.get(this.getName)
if (grads != null) {
val optimMethod = this.getOptimMethod
Log4Error.invalidInputError(optimMethod != null,
s"optim method for ${this.getName} cannot be null")
optimMethod.optimize(_ => (ev.fromType(0.0f), grads), weights)
this.zeroGradParameters()
}
}
}
}
}