package org.apache.spark.ml.odkl
import java.util.concurrent.ThreadLocalRandom
import odkl.analysis.spark.util.RDDOperations._
import odkl.analysis.spark.util.collection.CompactBuffer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.{Estimator, Model, Transformer}
import org.apache.spark.ml.param.shared.{HasInputCol, HasLabelCol}
import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}
/**
* Follows ideas from the Combined Regression and Ranking paper (http://www.decom.ufop.br/menotti/rp122/sem/sem2-alex-art.pdf)
*
* Can model a pair-wise ranking task (sample pairs, subtract features, and label them 1/0), can model point-wise
* regression, or can combine both by choosing whether to sample a single item or a pair.
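*
* A minimal usage sketch (the column name "userId" and the rate values are assumptions for
* illustration; setGroupByColumns is expected to be provided by HasGroupByColumns):
* {{{
*   val sampler = new CRRSamplerModel()
*     .setGroupByColumns("userId")
*     .setGroupSampleRate(0.8)
*     .setItemSampleRate(0.5)
*     .setRankingPower(0.7)
*   val sampled = sampler.transform(trainingData)
* }}}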
*/
trait CRRSamplerParams extends HasInputCol with HasGroupByColumns with HasLabelCol {
val groupSampleRate = new DoubleParam(this, "groupSampleRate",
"The fraction of lists (users) to keep in the sample.", (x: Double) => x > 0 && x <= 1)
val itemSampleRate = new DoubleParam(this, "itemSampleRate",
"The fraction of items to keep in the sample (relative to the number of items, not to the number of possible pairs).", (x: Double) => x > 0 && x <= 1)
val rankingPower = new DoubleParam(this, "rankingPower",
"The fraction of paired samples (for each selected item we decide whether to return it as is or to select a pair).", (x: Double) => x >= 0 && x <= 1)
val shuffleToPartitions = new IntParam(this, "shuffleToPartitions",
"Since after sampling the size might decrease, it is worth to shuffle dataset to fewer partitions. Shuffle also can improve" +
"convergence by providing better order randomization.", (x: Int) => x > 0)
def setGroupSampleRate(value: Double): this.type = set(groupSampleRate, value)
def setItemSampleRate(value: Double): this.type = set(itemSampleRate, value)
def setRankingPower(value: Double): this.type = set(rankingPower, value)
def setShuffleToPartitions(value: Int): this.type = set(shuffleToPartitions, value)
setDefault(
rankingPower -> 0.0,
inputCol -> "features",
labelCol -> "label"
)
}
/**
* Model applied as a transformer, but the resulting data set is not deterministic (each pass produces different
* results). Results must not be cached (an evicted and recomputed partition would yield a different sample).
*/
class CRRSamplerModel(override val uid: String) extends
Model[CRRSamplerModel] with DefaultParamsWritable with CRRSamplerParams with HasNetlibBlas {
def this() = this(Identifiable.randomUID("crrSampler"))
setDefault(
groupSampleRate -> 1.0,
itemSampleRate -> 1.0
)
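/**
* Groups rows within each partition by the configured key columns (or by a constant key when no
* grouping columns are set), samples every group via sampleRows, drops the temporary key column
* if one was added, and optionally repartitions the result.
*/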
override def transform(dataset: Dataset[_]): DataFrame = {
val (data, keyIndex, toDrop) = if (isDefined(groupByColumns) && $(groupByColumns).length > 0) {
if ($(groupByColumns).length == 1) {
(dataset, dataset.schema.fieldIndex($(groupByColumns).head), None)
} else {
val keyStruct = functions.struct($(groupByColumns).map(x => dataset(x)): _*)
val keyName = s"${uid}_tmpKey"
val data = dataset.withColumn(keyName, keyStruct)
(data, data.schema.fieldIndex(keyName), Some(keyName))
}
} else {
val keyName = s"${uid}_tmpKey"
val data = dataset.withColumn(keyName, functions.lit(keyName))
(data, data.schema.fieldIndex(keyName), Some(keyName))
}
val featuresIndex = data.schema.fieldIndex($(inputCol))
val labelIndex = data.schema.fieldIndex($(labelCol))
val result = dataset.sqlContext.createDataFrame(
data.toDF
.rdd
.groupWithinPartitionsBy(x => x.get(keyIndex))
.flatMap(x => sampleRows(x._2, labelIndex, featuresIndex)),
data.schema
)
val noExtraColumn = toDrop.map(x => result.drop(x)).getOrElse(result)
get(shuffleToPartitions).map(x => noExtraColumn.repartition(x)).getOrElse(noExtraColumn)
}
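/**
* Samples a single group (list) of rows. With probability 1 - groupSampleRate the whole group is
* skipped. When ranking is enabled, rows are split into positives (label > 0) and negatives,
* single-class groups are discarded, and pairs are produced in both directions.
*/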
def sampleRows(rows: Iterator[Row], labelIndex: Int, featureIndex: Int): Iterator[Row] = {
if ($(groupSampleRate) < 1 && ThreadLocalRandom.current().nextDouble() > $(groupSampleRate)) {
// Skip entire group
Iterator.empty
} else if ($(rankingPower) > 0) {
val positives = new CompactBuffer[Row]()
val negatives = new CompactBuffer[Row]()
rows.foreach(x => if (x.getDouble(labelIndex) > 0) {
positives += x
} else {
negatives += x
})
if (positives.isEmpty || negatives.isEmpty) {
// Can not use single-class groups when the ranking part is enabled.
Iterator.empty
} else {
counterSample(positives.iterator, negatives, featureIndex) ++ counterSample(negatives.iterator, positives, featureIndex)
}
} else {
counterSample(rows, Seq.empty, featureIndex)
}
}
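/**
* Applies item-level sampling (each row is kept with probability itemSampleRate) and then,
* depending on rankingPower, emits rows as is (pure regression), pairs every row with a random
* counterexample (pure ranking), or pairs each row with probability rankingPower (mixture).
*/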
def counterSample(source: Iterator[Row], counterPart: Seq[Row], featureIndex: Int): Iterator[Row] = {
val sampled = if ($(itemSampleRate) < 1) {
source.filter(_ => ThreadLocalRandom.current().nextDouble() < $(itemSampleRate))
} else {
source
}
if ($(rankingPower) <= 0) {
// Pure regression - emit the sampled items as is.
sampled
} else if ($(rankingPower) >= 1) {
// Pure ranking - pair every sampled item with a random counterexample.
sampled.map(x => pairSample(x, counterPart, featureIndex))
} else {
// Mixture - pair each item with probability rankingPower, otherwise emit it as is.
sampled.map(x => {
if (ThreadLocalRandom.current().nextDouble() < $(rankingPower)) {
pairSample(x, counterPart, featureIndex)
} else {
x
}
})
}
}
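/**
* Produces a pair-wise sample: picks a random row from the counterpart and replaces the features
* of the given row with the difference between its features and the counterpart's features.
* All other columns, including the label, are kept as is.
*/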
def pairSample(sample: Row, counterPart: Seq[Row], featureIndex: Int): Row = {
val counterFeatures: Vector = counterPart(ThreadLocalRandom.current().nextInt(counterPart.size)).getAs[Vector](featureIndex)
val result = Array.tabulate(sample.length) { i =>
if (i == featureIndex) {
val features = sample.getAs[Vector](featureIndex)
val result = Vectors.zeros(features.size).toDense
axpy(1.0, features, result.values)
axpy(-1.0, counterFeatures, result.values)
result
} else {
sample.get(i)
}
}
Row.fromSeq(result)
}
override def copy(extra: ParamMap): CRRSamplerModel = defaultCopy(extra)
@DeveloperApi
override def transformSchema(schema: StructType): StructType = schema
}
/**
* Estimator used to select the proper item sample rate to achieve the desired size of the resulting
* sample. Takes into consideration the source dataset size and the number of lists valid for ranking
* (lists with samples of different rank).
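*
* A minimal usage sketch (the column name and the expected sample count are assumptions for illustration):
* {{{
*   val model = new CRRSamplerEstimator()
*     .setGroupByColumns("userId")
*     .setRankingPower(1.0)
*     .setExpectedNumberOfSamples(1000000)
*     .fit(trainingData)
*   val sampled = model.transform(trainingData)
* }}}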
*/
class CRRSamplerEstimator(override val uid: String) extends Estimator[CRRSamplerModel]
with DefaultParamsWritable with CRRSamplerParams {
val expectedNumSamples = new IntParam(this, "expectedNumSamples",
"The expected number of samples in the result. Required.", (x: Int) => x > 0)
def setExpectedNumberOfSamples(value: Int): this.type = set(expectedNumSamples, value)
setDefault(
groupSampleRate -> 1.0
)
def this() = this(Identifiable.randomUID("crrSamplerEstimator"))
override def fit(dataset: Dataset[_]): CRRSamplerModel = {
val totalSamples: Double = if ($(rankingPower) > 0 && isDefined(groupByColumns) && $(groupByColumns).length > 0) {
val (withKey, keyIndex) = if ($(groupByColumns).length == 1) {
(dataset, dataset.schema.fieldIndex($(groupByColumns).head))
} else {
val key = functions.struct($(groupByColumns).map(x => dataset(x)): _*)
val keyName = s"${uid}_tmpKey"
val data = dataset.withColumn(keyName, key)
(data, data.schema.fieldIndex(keyName))
}
val labelIndex = withKey.schema.fieldIndex($(labelCol))
withKey.toDF
.rdd
.groupWithinPartitionsBy(x => x.get(keyIndex))
.map(x => {
var numPositives: Long = 0
var numNegatives: Long = 0
x._2.foreach(row => if (row.getDouble(labelIndex) > 0) numPositives += 1 else numNegatives += 1)
if (numNegatives > 0 && numPositives > 0) numNegatives + numPositives else 0
}).sum()
} else {
dataset.count()
}
val discountedByGroupRate = totalSamples * $(groupSampleRate)
val requiredItemSampleRate = Math.min(1.0, $(expectedNumSamples) / discountedByGroupRate)
logInfo(s"Estimated total number of samples $totalSamples, after groups sampling $discountedByGroupRate. Chosen item sample rate is $requiredItemSampleRate")
val model: CRRSamplerModel = copyValues(new CRRSamplerModel(), ParamMap(itemSampleRate -> requiredItemSampleRate))
model.setParent(this)
}
override def copy(extra: ParamMap): CRRSamplerEstimator = defaultCopy(extra)
@DeveloperApi
override def transformSchema(schema: StructType): StructType = schema
}