org.apache.spark.ml.classification.NaiveBayes.scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.classification

import org.apache.hadoop.fs.Path
import org.json4s.DefaultFormats

import org.apache.spark.annotation.Since
import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.functions.checkNonNegativeWeight
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.HasWeightCol
import org.apache.spark.ml.stat.Summarizer
import org.apache.spark.ml.util._
import org.apache.spark.ml.util.Instrumentation.instrumented
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.util.VersionUtils

/**
 * Params for Naive Bayes Classifiers.
 */
private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {

  /**
   * The smoothing parameter.
   * (default = 1.0).
   * @group param
   */
  final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.",
    ParamValidators.gtEq(0))

  /** @group getParam */
  final def getSmoothing: Double = $(smoothing)

  /**
   * The model type which is a string (case-sensitive).
   * Supported options: "multinomial", "complement", "bernoulli", "gaussian".
   * (default = multinomial)
   * @group param
   */
  final val modelType: Param[String] = new Param[String](this, "modelType", "The model type " +
    "which is a string (case-sensitive). Supported options: multinomial (default), complement, " +
    "bernoulli and gaussian.",
    ParamValidators.inArray[String](NaiveBayes.supportedModelTypes.toArray))

  /** @group getParam */
  final def getModelType: String = $(modelType)

  setDefault(smoothing -> 1.0, modelType -> NaiveBayes.Multinomial)
}

// scalastyle:off line.size.limit
/**
 * Naive Bayes Classifiers.
 * It supports Multinomial NB
 * (see <a href="http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html">
 * here</a>)
 * which can handle finitely supported discrete data. For example, by converting documents into
 * TF-IDF vectors, it can be used for document classification. By making every vector a
 * binary (0/1) vector, it can also be used as Bernoulli NB
 * (see <a href="http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html">
 * here</a>).
 * The input feature values for Multinomial NB and Bernoulli NB must be nonnegative.
 * Since 3.0.0, it supports Complement NB, which is an adaptation of the Multinomial NB. Specifically,
 * Complement NB uses statistics from the complement of each class to compute the model's coefficients.
 * The inventors of Complement NB show empirically that the parameter estimates for CNB are more stable
 * than those for Multinomial NB. Like Multinomial NB, the input feature values for Complement NB must
 * be nonnegative.
 * Since 3.0.0, it also supports Gaussian NB
 * (see <a href="https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes">
 * here</a>)
 * which can handle continuous data.
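 *
 * A minimal usage sketch (illustrative only; it assumes a DataFrame `training` with the default
 * "label" and "features" columns, which is not defined in this file):
 * {{{
 *   val nb = new NaiveBayes()
 *     .setModelType("multinomial")
 *     .setSmoothing(1.0)
 *   val model = nb.fit(training)
 *   val predictions = model.transform(training)
 * }}}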
 */
// scalastyle:on line.size.limit
@Since("1.5.0")
class NaiveBayes @Since("1.5.0") (
    @Since("1.5.0") override val uid: String)
  extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel]
  with NaiveBayesParams with DefaultParamsWritable {

  import NaiveBayes._

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("nb"))

  /**
   * Set the smoothing parameter.
   * Default is 1.0.
   * @group setParam
   */
  @Since("1.5.0")
  def setSmoothing(value: Double): this.type = set(smoothing, value)

  /**
   * Set the model type using a string (case-sensitive).
   * Supported options: "multinomial", "complement", "bernoulli", and "gaussian".
   * Default is "multinomial"
   * @group setParam
   */
  @Since("1.5.0")
  def setModelType(value: String): this.type = set(modelType, value)

  /**
   * Sets the value of param [[weightCol]].
   * If this is not set or empty, we treat all instance weights as 1.0.
   * Default is not set, so all instances have weight one.
   *
   * @group setParam
   */
  @Since("2.1.0")
  def setWeightCol(value: String): this.type = set(weightCol, value)

  override protected def train(dataset: Dataset[_]): NaiveBayesModel = {
    trainWithLabelCheck(dataset, positiveLabel = true)
  }

  /**
   * ml assumes input labels are in the range [0, numClasses). However, this implementation
   * is also called by mllib NaiveBayes, which allows other kinds of input labels such as
   * {-1, +1}. `positiveLabel` determines whether the labels should be checked; it should be
   * removed when mllib NaiveBayes is removed.
   */
  private[spark] def trainWithLabelCheck(
      dataset: Dataset[_],
      positiveLabel: Boolean): NaiveBayesModel = instrumented { instr =>
    instr.logPipelineStage(this)
    instr.logDataset(dataset)
    instr.logParams(this, labelCol, featuresCol, weightCol, predictionCol, rawPredictionCol,
      probabilityCol, modelType, smoothing, thresholds)

    if (positiveLabel && isDefined(thresholds)) {
      val numClasses = getNumClasses(dataset)
      instr.logNumClasses(numClasses)
      require($(thresholds).length == numClasses, this.getClass.getSimpleName +
        ".train() called with non-matching numClasses and thresholds.length." +
        s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}")
    }

    $(modelType) match {
      case Bernoulli | Multinomial | Complement =>
        trainDiscreteImpl(dataset, instr)
      case Gaussian =>
        trainGaussianImpl(dataset, instr)
      case _ =>
        // This should never happen.
        throw new IllegalArgumentException(s"Invalid modelType: ${$(modelType)}.")
    }
  }

  private def trainDiscreteImpl(
      dataset: Dataset[_],
      instr: Instrumentation): NaiveBayesModel = {
    val spark = dataset.sparkSession
    import spark.implicits._

    val validateUDF = $(modelType) match {
      case Multinomial | Complement =>
        udf { vector: Vector => requireNonnegativeValues(vector); vector }
      case Bernoulli =>
        udf { vector: Vector => requireZeroOneBernoulliValues(vector); vector }
    }

    val w = if (isDefined(weightCol) && $(weightCol).nonEmpty) {
      checkNonNegativeWeight(col($(weightCol)).cast(DoubleType))
    } else {
      lit(1.0)
    }

    // Aggregates term frequencies per label.
    val aggregated = dataset.groupBy(col($(labelCol)))
      .agg(sum(w).as("weightSum"), Summarizer.metrics("sum", "count")
        .summary(validateUDF(col($(featuresCol))), w).as("summary"))
      .select($(labelCol), "weightSum", "summary.sum", "summary.count")
      .as[(Double, Double, Vector, Long)]
      .collect().sortBy(_._1)

    val numFeatures = aggregated.head._3.size
    instr.logNumFeatures(numFeatures)
    val numSamples = aggregated.map(_._4).sum
    instr.logNumExamples(numSamples)
    val numLabels = aggregated.length
    instr.logNumClasses(numLabels)
    val numDocuments = aggregated.map(_._2).sum
    instr.logSumOfWeights(numDocuments)

    val labelArray = new Array[Double](numLabels)
    val piArray = new Array[Double](numLabels)
    val thetaArray = new Array[Double](numLabels * numFeatures)

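    // For Complement NB (Rennie et al., 2003), each class's term frequencies are replaced by the
    // aggregate term frequencies of all *other* classes: comp_c = featureSum - sumTermFreqs_c.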
    val aggIter = $(modelType) match {
      case Multinomial | Bernoulli => aggregated.iterator
      case Complement =>
        val featureSum = Vectors.zeros(numFeatures)
        aggregated.foreach { case (_, _, sumTermFreqs, _) =>
          BLAS.axpy(1.0, sumTermFreqs, featureSum)
        }
        aggregated.iterator.map { case (label, n, sumTermFreqs, count) =>
          val comp = featureSum.copy
          BLAS.axpy(-1.0, sumTermFreqs, comp)
          (label, n, comp, count)
        }
    }

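    // With additive (Laplace/Lidstone) smoothing lambda, everything is computed in log space:
    //   prior:       pi(i)       = log((n_i + lambda) / (numDocuments + numLabels * lambda))
    //   conditional: theta(i, j) = log((freq_ij + lambda) / denom_i)
    // where denom_i is the smoothed total term frequency of class i (Multinomial/Complement)
    // or the smoothed class weight (Bernoulli); see thetaLogDenom below.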
    val lambda = $(smoothing)
    val piLogDenom = math.log(numDocuments + numLabels * lambda)
    var i = 0
    aggIter.foreach { case (label, n, sumTermFreqs, _) =>
      labelArray(i) = label
      piArray(i) = math.log(n + lambda) - piLogDenom
      val thetaLogDenom = $(modelType) match {
        case Multinomial | Complement =>
          math.log(sumTermFreqs.toArray.sum + numFeatures * lambda)
        case Bernoulli => math.log(n + 2.0 * lambda)
      }
      var j = 0
      val offset = i * numFeatures
      while (j < numFeatures) {
        thetaArray(offset + j) = math.log(sumTermFreqs(j) + lambda) - thetaLogDenom
        j += 1
      }
      i += 1
    }

    val pi = Vectors.dense(piArray)
    $(modelType) match {
      case Multinomial | Bernoulli =>
        val theta = new DenseMatrix(numLabels, numFeatures, thetaArray, true)
        new NaiveBayesModel(uid, pi.compressed, theta.compressed, Matrices.zeros(0, 0))
          .setOldLabels(labelArray)
      case Complement =>
        // CNB computes the coefficients in a complement way, so negate theta here.
        val theta = new DenseMatrix(numLabels, numFeatures, thetaArray.map(v => -v), true)
        new NaiveBayesModel(uid, pi.compressed, theta.compressed, Matrices.zeros(0, 0))
    }
  }

  private def trainGaussianImpl(
      dataset: Dataset[_],
      instr: Instrumentation): NaiveBayesModel = {
    val spark = dataset.sparkSession
    import spark.implicits._

    val w = if (isDefined(weightCol) && $(weightCol).nonEmpty) {
      checkNonNegativeWeight(col($(weightCol)).cast(DoubleType))
    } else {
      lit(1.0)
    }

    // Aggregates mean vector and square-sum vector per label.
    val aggregated = dataset.groupBy(col($(labelCol)))
      .agg(sum(w).as("weightSum"), Summarizer.metrics("mean", "normL2")
        .summary(col($(featuresCol)), w).as("summary"))
      .select($(labelCol), "weightSum", "summary.mean", "summary.normL2")
      .as[(Double, Double, Vector, Vector)]
      .map { case (label, weightSum, mean, normL2) =>
        (label, weightSum, mean, Vectors.dense(normL2.toArray.map(v => v * v)))
      }.collect().sortBy(_._1)

    val numFeatures = aggregated.head._3.size
    instr.logNumFeatures(numFeatures)

    val numLabels = aggregated.length
    instr.logNumClasses(numLabels)

    val numInstances = aggregated.map(_._2).sum
    instr.logSumOfWeights(numInstances)

    // If the ratio of data variance between dimensions is too small, it
    // will cause numerical errors. To address this, we artificially
    // boost the variance by epsilon, a small fraction of the standard
    // deviation of the largest dimension.
    // Refer to scikit-learn's implementation
    // [https://github.com/scikit-learn/scikit-learn/blob/0.21.X/sklearn/naive_bayes.py#L348]
    // and discussion [https://github.com/scikit-learn/scikit-learn/pull/5349] for detail.
    val epsilon = Iterator.range(0, numFeatures).map { j =>
      var globalSum = 0.0
      var globalSqrSum = 0.0
      aggregated.foreach { case (_, weightSum, mean, squareSum) =>
        globalSum += mean(j) * weightSum
        globalSqrSum += squareSum(j)
      }
      globalSqrSum / numInstances -
        globalSum * globalSum / numInstances / numInstances
    }.max * 1e-9

    val piArray = new Array[Double](numLabels)

    // thetaArray in Gaussian NB stores the means of features per label
    val thetaArray = new Array[Double](numLabels * numFeatures)

    // sigmaArray in Gaussian NB stores the variances of features per label
    val sigmaArray = new Array[Double](numLabels * numFeatures)

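    // Per class i: pi(i) = log(weightSum_i / numInstances); theta stores the weighted mean of
    // each feature; sigma stores E[x^2] - mean^2 + epsilon, where epsilon guards against zero
    // variance (and thus division by zero at prediction time).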
    var i = 0
    val logNumInstances = math.log(numInstances)
    aggregated.foreach { case (_, weightSum, mean, squareSum) =>
      piArray(i) = math.log(weightSum) - logNumInstances
      var j = 0
      val offset = i * numFeatures
      while (j < numFeatures) {
        val m = mean(j)
        thetaArray(offset + j) = m
        sigmaArray(offset + j) = epsilon + squareSum(j) / weightSum - m * m
        j += 1
      }
      i += 1
    }

    val pi = Vectors.dense(piArray)
    val theta = new DenseMatrix(numLabels, numFeatures, thetaArray, true)
    val sigma = new DenseMatrix(numLabels, numFeatures, sigmaArray, true)
    new NaiveBayesModel(uid, pi.compressed, theta.compressed, sigma.compressed)
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra)
}

@Since("1.6.0")
object NaiveBayes extends DefaultParamsReadable[NaiveBayes] {
  /** String name for multinomial model type. */
  private[classification] val Multinomial: String = "multinomial"

  /** String name for Bernoulli model type. */
  private[classification] val Bernoulli: String = "bernoulli"

  /** String name for Gaussian model type. */
  private[classification] val Gaussian: String = "gaussian"

  /** String name for Complement model type. */
  private[classification] val Complement: String = "complement"

  /** Set of modelTypes that NaiveBayes supports. */
  private[classification] val supportedModelTypes =
    Set(Multinomial, Bernoulli, Gaussian, Complement)

  private[ml] def requireNonnegativeValues(v: Vector): Unit = {
    require(v.nonZeroIterator.forall(_._2 > 0.0),
      s"Naive Bayes requires nonnegative feature values but found $v.")
  }

  private[ml] def requireZeroOneBernoulliValues(v: Vector): Unit = {
    require(v.nonZeroIterator.forall(_._2 == 1.0),
      s"Bernoulli naive Bayes requires 0 or 1 feature values but found $v.")
  }

  @Since("1.6.0")
  override def load(path: String): NaiveBayes = super.load(path)
}

/**
 * Model produced by [[NaiveBayes]]
 *
 * @param pi    log of class priors, whose dimension is C (number of classes)
 * @param theta log of class conditional probabilities, whose dimension is C (number of classes)
 *              by D (number of features)
 * @param sigma variance of each feature, whose dimension is C (number of classes)
 *              by D (number of features). This matrix is only available when modelType
 *              is set to "gaussian".
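 *
 * A minimal persistence sketch (the path below is illustrative):
 * {{{
 *   model.write.overwrite().save("/tmp/spark-nb-model")
 *   val sameModel = NaiveBayesModel.load("/tmp/spark-nb-model")
 * }}}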
 */
@Since("1.5.0")
class NaiveBayesModel private[ml] (
    @Since("1.5.0") override val uid: String,
    @Since("2.0.0") val pi: Vector,
    @Since("2.0.0") val theta: Matrix,
    @Since("3.0.0") val sigma: Matrix)
  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel]
  with NaiveBayesParams with MLWritable {

  import NaiveBayes._

  /**
   * mllib NaiveBayes is currently a wrapper of the ml implementation.
   * mllib input labels may be {-1, +1}, and mllib NaiveBayesModel exposes the labels;
   * both differ from ml, so the labels are stored here sequentially so that mllib can
   * read them back. This should be removed when mllib NaiveBayes is removed.
   */
  private[spark] var oldLabels: Array[Double] = null

  private[spark] def setOldLabels(labels: Array[Double]): this.type = {
    this.oldLabels = labels
    this
  }

  /**
   * Bernoulli scoring requires log(condprob) if the feature is 1, and log(1 - condprob) if it
   * is 0. This precomputes log(1.0 - exp(theta)) and its sum, which are used to apply this
   * condition via linear algebra (in the predict function).
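   *
   * Concretely, for binary features x_j in {0, 1}:
   *   score(c) = pi(c) + sum_j [x_j * theta(c, j) + (1 - x_j) * log(1 - exp(theta(c, j)))]
   *            = piMinusThetaSum(c) + (thetaMinusNegTheta * x)(c)
   * which reduces Bernoulli scoring to a single gemv in bernoulliCalculation.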
   */
  @transient private lazy val thetaMinusNegTheta = $(modelType) match {
    case Bernoulli =>
      theta.map(value => value - math.log1p(-math.exp(value)))
    case _ =>
      // This should never happen.
      throw new IllegalArgumentException(s"Invalid modelType: ${$(modelType)}. " +
        "Variables thetaMinusNegTheta should only be precomputed in Bernoulli NB.")
  }

  @transient private lazy val piMinusThetaSum = $(modelType) match {
    case Bernoulli =>
      val negTheta = theta.map(value => math.log1p(-math.exp(value)))
      val ones = new DenseVector(Array.fill(theta.numCols)(1.0))
      val piMinusThetaSum = pi.toDense.copy
      BLAS.gemv(1.0, negTheta, ones, 1.0, piMinusThetaSum)
      piMinusThetaSum
    case _ =>
      // This should never happen.
      throw new IllegalArgumentException(s"Invalid modelType: ${$(modelType)}. " +
        "Variables piMinusThetaSum should only be precomputed in Bernoulli NB.")
  }

  /**
   * Gaussian scoring requires the per-class sum of log(variance).
   * This precomputes those sums, which are used to apply this
   * condition via linear algebra (in the predict function).
   */
  @transient private lazy val logVarSum = $(modelType) match {
    case Gaussian =>
      Array.tabulate(numClasses) { i =>
        Iterator.range(0, numFeatures).map { j =>
          math.log(sigma(i, j))
        }.sum
      }
    case _ =>
      // This should never happen.
      throw new IllegalArgumentException(s"Invalid modelType: ${$(modelType)}. " +
        "Variables logVarSum should only be precomputed in Gaussian NB.")
  }

  @Since("1.6.0")
  override val numFeatures: Int = theta.numCols

  @Since("1.5.0")
  override val numClasses: Int = pi.size

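  // Multinomial NB posterior in log space: score(c) = pi(c) + theta(c, :) dot features,
  // computed for all classes at once with a single matrix-vector multiply (gemv).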
  private def multinomialCalculation(features: Vector) = {
    requireNonnegativeValues(features)
    val prob = pi.toDense.copy
    BLAS.gemv(1.0, theta, features, 1.0, prob)
    prob
  }

  private def complementCalculation(features: Vector) = {
    requireNonnegativeValues(features)
    val probArray = theta.multiply(features).toArray
    // The following lines are equivalent to:
    //   val logSumExp = math.log(probArray.map(math.exp).sum)
    // However, that form easily overflows to Infinity/NaN.
    // Instead, this follows 'scipy.special.logsumexp' (which is used in Scikit-Learn's
    // ComplementNB) to compute the log of the sum of exponentials in a numerically stable way.
    val max = probArray.max
    var sumExp = 0.0
    var j = 0
    while (j < probArray.length) {
      sumExp += math.exp(probArray(j) - max)
      j += 1
    }
    val logSumExp = math.log(sumExp) + max

    j = 0
    while (j < probArray.length) {
      probArray(j) = probArray(j) - logSumExp
      j += 1
    }
    Vectors.dense(probArray)
  }

  private def bernoulliCalculation(features: Vector) = {
    requireZeroOneBernoulliValues(features)
    val prob = piMinusThetaSum.copy
    BLAS.gemv(1.0, thetaMinusNegTheta, features, 1.0, prob)
    prob
  }

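  // Gaussian NB log-likelihood, dropping the additive constant shared by all classes:
  //   score(c) = pi(c) - 0.5 * (sum_j [(features(j) - theta(c, j))^2 / sigma(c, j)] + logVarSum(c))
  // where logVarSum(c) precomputes sum_j log(sigma(c, j)).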
  private def gaussianCalculation(features: Vector) = {
    val prob = Array.ofDim[Double](numClasses)
    var i = 0
    while (i < numClasses) {
      var s = 0.0
      var j = 0
      while (j < numFeatures) {
        val d = features(j) - theta(i, j)
        s += d * d / sigma(i, j)
        j += 1
      }
      prob(i) = pi(i) - (s + logVarSum(i)) / 2
      i += 1
    }
    Vectors.dense(prob)
  }

  @transient private lazy val predictRawFunc = {
    $(modelType) match {
      case Multinomial =>
        features: Vector => multinomialCalculation(features)
      case Complement =>
        features: Vector => complementCalculation(features)
      case Bernoulli =>
        features: Vector => bernoulliCalculation(features)
      case Gaussian =>
        features: Vector => gaussianCalculation(features)
    }
  }

  @Since("3.0.0")
  override def predictRaw(features: Vector): Vector = predictRawFunc(features)

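  // Raw predictions are unnormalized per-class log-probabilities; applying softmax in place
  // converts them into posterior probabilities.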
  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
    rawPrediction match {
      case dv: DenseVector =>
        Utils.softmax(dv.values)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): NaiveBayesModel = {
    copyValues(new NaiveBayesModel(uid, pi, theta, sigma).setParent(this.parent), extra)
  }

  @Since("1.5.0")
  override def toString: String = {
    s"NaiveBayesModel: uid=$uid, modelType=${$(modelType)}, numClasses=$numClasses, " +
      s"numFeatures=$numFeatures"
  }

  @Since("1.6.0")
  override def write: MLWriter = new NaiveBayesModel.NaiveBayesModelWriter(this)
}

@Since("1.6.0")
object NaiveBayesModel extends MLReadable[NaiveBayesModel] {

  @Since("1.6.0")
  override def read: MLReader[NaiveBayesModel] = new NaiveBayesModelReader

  @Since("1.6.0")
  override def load(path: String): NaiveBayesModel = super.load(path)

  /** [[MLWriter]] instance for [[NaiveBayesModel]] */
  private[NaiveBayesModel] class NaiveBayesModelWriter(instance: NaiveBayesModel) extends MLWriter {
    import NaiveBayes._

    private case class Data(pi: Vector, theta: Matrix, sigma: Matrix)

    override protected def saveImpl(path: String): Unit = {
      // Save metadata and Params
      DefaultParamsWriter.saveMetadata(instance, path, sc)
      val dataPath = new Path(path, "data").toString

      instance.getModelType match {
        case Multinomial | Bernoulli | Complement =>
          require(instance.sigma.numRows == 0 && instance.sigma.numCols == 0)
        case Gaussian =>
          require(instance.sigma.numRows != 0 && instance.sigma.numCols != 0)
      }

      val data = Data(instance.pi, instance.theta, instance.sigma)
      sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
    }
  }

  private class NaiveBayesModelReader extends MLReader[NaiveBayesModel] {

    /** Checked against metadata when loading model */
    private val className = classOf[NaiveBayesModel].getName

    override def load(path: String): NaiveBayesModel = {
      implicit val format = DefaultFormats
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
      val (major, minor) = VersionUtils.majorMinorVersion(metadata.sparkVersion)

      val dataPath = new Path(path, "data").toString
      val data = sparkSession.read.parquet(dataPath)
      val vecConverted = MLUtils.convertVectorColumnsToML(data, "pi")

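      // Models saved by Spark < 3.0 have no "sigma" column (Gaussian NB was added in 3.0.0),
      // so an empty sigma matrix is substituted for them.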
      val model = if (major.toInt < 3) {
        val Row(pi: Vector, theta: Matrix) =
          MLUtils.convertMatrixColumnsToML(vecConverted, "theta")
            .select("pi", "theta")
            .head()
        new NaiveBayesModel(metadata.uid, pi, theta, Matrices.zeros(0, 0))
      } else {
        val Row(pi: Vector, theta: Matrix, sigma: Matrix) =
          MLUtils.convertMatrixColumnsToML(vecConverted, "theta", "sigma")
            .select("pi", "theta", "sigma")
            .head()
        new NaiveBayesModel(metadata.uid, pi, theta, sigma)
      }

      metadata.getAndSetParams(model)
      model
    }
  }
}



