com.salesforce.op.utils.spark.RichDataset.scala

AutoML library for building modular, reusable, strongly typed machine learning workflows on Spark with minimal hand tuning

/*
 * Copyright (c) 2017, Salesforce.com, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of the copyright holder nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package com.salesforce.op.utils.spark

import com.salesforce.op.features.types._
import com.salesforce.op.features.{FeatureLike, FeatureSparkTypes, OPFeature}
import com.salesforce.op.utils.text.TextUtils
import org.apache.avro.mapred.AvroInputFormat
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder, StructType}
import org.apache.spark.ml.linalg.{Vector, Vectors}

import scala.collection.mutable.{WrappedArray => MWrappedArray}
import scala.reflect.ClassTag


/**
 * Dataset enrichment functions
 */
object RichDataset {

  import com.salesforce.op.utils.spark.RichRow._
  import com.salesforce.op.utils.spark.RichDataType._

  private[op] val vectorToArrayUDF = udf { (v: Vector) => if (v == null) null else v.toArray }
  private[op] val arrayToVectorUDF = udf { (a: MWrappedArray[Double]) =>
    if (a == null) null else Vectors.dense(a.toArray[Double]).compressed
  }
  private[op] val IsVectorMetadataKey = "isVector"
  private[op] val OriginalNameMetaKey = "originalName"
  private[op] def schemaPath(path: String): String = s"${path.stripSuffix("/")}/schema"
  private[op] def dataPath(path: String): String = s"${path.stripSuffix("/")}/data"

  private val AvroFormat = "avro"

  implicit class RichDataFrameWriter[T](w: DataFrameWriter[T]) {

    /**
     * Saves the content of the `DataFrame` in Avro format at the specified path.
     * This is equivalent to:
     * {{{
     *   format("avro").save(path)
     * }}}
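     *
     * A minimal usage sketch (the dataframe `df` and output path are hypothetical):
     * {{{
     *   import com.salesforce.op.utils.spark.RichDataset._
     *   df.write.mode(SaveMode.Overwrite).avro("/tmp/my-data")
     * }}}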
     */
    def avro(path: String): Unit = w.format(AvroFormat).save(path)

  }

  implicit class RichDataFrameReader(r: DataFrameReader) {

    /**
     * Loads Avro files and returns the result as a `DataFrame`.
     * This is equivalent to:
     * {{{
     *   format("avro").load(path)
     * }}}
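     *
     * A minimal usage sketch (the input path is hypothetical):
     * {{{
     *   import com.salesforce.op.utils.spark.RichDataset._
     *   val df = spark.read.avro("/tmp/my-data")
     * }}}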
     */
    def avro(path: String): DataFrame = r.format(AvroFormat).load(path)

  }

  /**
   * Loads a dataframe from the Avro data file and dataframe schema file generated by RichDataset.saveAvro.
   * Relies on the spark-avro package for Avro file generation, which appears to have a quirk that makes all
   * fields nullable when they are read back.
   *
   * @param path data path
   * @return reconstructed dataframe (with all fields marked as nullable)
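   *
   * A minimal usage sketch (the path is hypothetical), assuming an implicit SparkSession in scope:
   * {{{
   *   val df = RichDataset.loadAvro("/tmp/saved-dataset")
   * }}}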
   */
  def loadAvro(path: String)(implicit spark: SparkSession): DataFrame = {
    val schemaStr = spark.sparkContext.textFile(schemaPath(path)).collect().mkString
    val schema = DataType.fromJson(schemaStr).asInstanceOf[StructType]
    val origNames = schema.fields.map(_.metadata.getString(OriginalNameMetaKey))
    val data = spark.read.avro(dataPath(path)).toDF(origNames: _*)
    val columns =
      for {
        (c, f) <- data.columns.zip(schema.fields)
        meta = f.metadata
        isVector = meta.contains(IsVectorMetadataKey) && meta.getBoolean(IsVectorMetadataKey)
        column = if (isVector) arrayToVectorUDF(col(c)) else col(c)
      } yield column.as(c, meta)

    data.select(columns: _*)
  }

  /**
   * Enriches a dataframe with three quantifiers: forall, exists, and forNone (see below).
   * The rest of the extended functionality comes from RichDataset.
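   *
   * For example, with a hypothetical dataframe `myDF` containing a numeric column "age":
   * {{{
   *   myDF.forall[Double]("age")(_ >= 0)   // true iff every value is non-negative
   *   myDF.exists[Double]("age")(_ > 100)  // true iff at least one value exceeds 100
   *   myDF.forNone[Double]("age")(_ < 0)   // true iff no value is negative
   * }}}
   *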
   * @param ds data frame
   */
  implicit class RichDataFrame(ds: DataFrame) extends RichDataset(ds) {

    /**
     * Given a column name and a predicate, checks that all values satisfy the predicate
     *
     * @param columnName column name
     * @param predicate  predicate, T => Boolean
     * @tparam T column value type
     * @return true iff all values satisfy the predicate
     *
     *         Examples of usage:
     *         {{{
     *            myDF.forall[Double]("MyNumericColumn")(x => x >= -1 && x <= 1)
     *
     *            myDF.exists[String]("MyStringColumn")(x =>
     *              (x contains "Country") || (x contains "State")
     *            )
     *         }}}
     */
    def forall[T](columnName: String)(predicate: T => Boolean): Boolean =
      forNone(columnName)((t: T) => !predicate(t))

    /**
     * Given a feature and a predicate, checks that all values satisfy the predicate
     *
     * @param feature feature that describes column
     * @tparam T column value type
     * @return a quantifier that acts on predicate, T => Boolean,
     *         producing true iff all values satisfy the predicate
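     *
     * For example, with a hypothetical feature `age: FeatureLike[Real]`, whose values are assumed
     * to be of type Option[Double]:
     * {{{
     *   myDF.forall(age)(_.forall(_ >= 0))
     * }}}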
     */
    def forall[T <: FeatureType: FeatureTypeSparkConverter]
      (feature: FeatureLike[T])(predicate: T#Value => Boolean): Boolean =
      forNone(feature)((t: T#Value) => !predicate(t))

    /**
     * Given a column name and a predicate, checks that some values satisfy the predicate
     *
     * @param columnName column name
     * @param predicate predicate, T => Boolean
     * @tparam T column value type
     * @return true iff at least one value satisfies the predicate
     */
    def exists[T](columnName: String)(predicate: T => Boolean): Boolean =
      !forNone(columnName)(predicate)

    /**
     * Given a feature and a predicate, checks that some values satisfy the predicate
     *
     * @param feature feature that describes column
     * @tparam T column value type
     * @return a quantifier that acts on predicate, T => Boolean,
     *         producing true iff at least one value satisfies the predicate
     */
    def exists[T <: FeatureType: FeatureTypeSparkConverter]
      (feature: FeatureLike[T])(predicate: T#Value => Boolean): Boolean =
      !forNone(feature)(predicate)

    /**
     * Given a column name and a predicate, checks that none of the values satisfy the predicate
     *
     * @param columnName column name
     * @param predicate predicate, T => Boolean
     * @tparam T column value type
     * @return true iff none of the values satisfy the predicate
     */
    def forNone[T](columnName: String)(predicate: T => Boolean): Boolean =
      ds.filter(row => predicate(row.getAs[T](columnName))).isEmpty

    /**
     * Given a feature and a predicate, checks that none of the values satisfy the predicate
     *
     * @param feature feature that describes column
     * @tparam T column value type
     * @return a quantifier that acts on predicate, T => Boolean,
     *         producing true iff none of the values satisfy the predicate
     */
    def forNone[T <: FeatureType : FeatureTypeSparkConverter]
      (feature: FeatureLike[T])(predicate: T#Value => Boolean): Boolean =
      ds.filter(row => predicate(row.getFeatureType(feature).value)).isEmpty
  }

  implicit class RichDataset(val ds: Dataset[_]) {

    /**
     * Converts a dataframe with complex feature names and vector types into an Avro-compatible format
     * and saves it to the specified location
     *
     * @param path       location to save data
     * @param cleanNames whether to strip non-alphanumeric characters from column names before saving
     * @param options    output options for the underlying data source
     * @param saveMode   Specifies the behavior when data or table already exists.
     *                   Options include:
     *                   - `SaveMode.Overwrite`: overwrite the existing data.
     *                   - `SaveMode.Append`: append the data.
     *                   - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
     *                   - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
     * @param spark      spark session used to save the original schema information with metadata
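     *
     * A minimal usage sketch (the path is hypothetical), assuming an implicit SparkSession in scope:
     * {{{
     *   ds.saveAvro(path = "/tmp/saved-dataset", saveMode = SaveMode.Overwrite)
     * }}}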
     */
    def saveAvro(
      path: String,
      cleanNames: Boolean = true,
      options: Map[String, String] = Map.empty,
      saveMode: SaveMode = SaveMode.ErrorIfExists
    )(implicit spark: SparkSession): Unit = {
      val schema = ds.schema
      val columns = ds.columns.map { c =>
        val cSchema = schema(c)
        val isVector = cSchema.dataType.equalsIgnoreNullability(FeatureSparkTypes.OPVector)
        val newMeta = {
          new MetadataBuilder()
            .withMetadata(cSchema.metadata)
            .putString(key = OriginalNameMetaKey, value = c)
            .putBoolean(key = IsVectorMetadataKey, value = isVector)
        }
        // TODO: Make an option to use a custom Avro type (record) to store sparse vectors directly
        val column = if (isVector) vectorToArrayUDF(col(c)) else col(c)
        val newName = if (cleanNames) TextUtils.cleanString(c) else c
        column.as(newName, newMeta.build)
      }
      val cleaned = ds.select(columns: _*)

      spark.sparkContext.parallelize(Seq(cleaned.schema.prettyJson), 1).saveAsTextFile(schemaPath(path))
      cleaned.write.mode(saveMode).options(options).avro(dataPath(path))
    }

    /**
     * Collects features from the dataset.
     *
     * Running collect requires moving all the data into the application's driver process, and
     * doing so on a very large dataset can crash the driver process with OutOfMemoryError.
     *
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of feature values
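     *
     * For example, with a hypothetical feature `age: FeatureLike[Real]`:
     * {{{
     *   val ages: Array[Real] = ds.collect(age)
     * }}}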
     */
    def collect[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      f: FeatureLike[F1]
    ): Array[F1] =
      select(f).collect().map(r => r.getFeatureType(f))

    /**
     * Collects features from the dataset.
     *
     * Running collect requires moving all the data into the application's driver process, and
     * doing so on a very large dataset can crash the driver process with OutOfMemoryError.
     *
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of feature values
     */
    def collect[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      f1: FeatureLike[F1], f2: FeatureLike[F2]
    ): Array[(F1, F2)] =
      select(f1, f2).collect().map(r => (r.getFeatureType(f1), r.getFeatureType(f2)))

    /**
     * Collects features from the dataset.
     *
     * Running collect requires moving all the data into the application's driver process, and
     * doing so on a very large dataset can crash the driver process with OutOfMemoryError.
     *
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of feature values
     */
    def collect[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F3 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      f1: FeatureLike[F1], f2: FeatureLike[F2], f3: FeatureLike[F3]
    ): Array[(F1, F2, F3)] =
      select(f1, f2, f3).collect().map(r => (r.getFeatureType(f1), r.getFeatureType(f2), r.getFeatureType(f3)))

    /**
     * Collects features from the dataset.
     *
     * Running collect requires moving all the data into the application's driver process, and
     * doing so on a very large dataset can crash the driver process with OutOfMemoryError.
     *
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of feature values
     */
    def collect[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F3 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F4 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      f1: FeatureLike[F1], f2: FeatureLike[F2], f3: FeatureLike[F3], f4: FeatureLike[F4]
    ): Array[(F1, F2, F3, F4)] =
      select(f1, f2, f3, f4).collect().map(r =>
        (r.getFeatureType(f1), r.getFeatureType(f2), r.getFeatureType(f3), r.getFeatureType(f4)))

    /**
     * Collects features from the dataset.
     *
     * Running collect requires moving all the data into the application's driver process, and
     * doing so on a very large dataset can crash the driver process with OutOfMemoryError.
     *
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of feature values
     */
    def collect[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F3 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F4 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F5 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      f1: FeatureLike[F1], f2: FeatureLike[F2], f3: FeatureLike[F3], f4: FeatureLike[F4], f5: FeatureLike[F5]
    ): Array[(F1, F2, F3, F4, F5)] =
      select(f1, f2, f3, f4, f5).collect().map(r =>
        (r.getFeatureType(f1), r.getFeatureType(f2), r.getFeatureType(f3),
          r.getFeatureType(f4), r.getFeatureType(f5)))

    /**
     * Collects features from the dataset and returns the first `n` values.
     *
     * Running take requires moving data into the application's driver process, and
     * doing so with a very large `n` can crash the driver process with OutOfMemoryError.
     *
     * @param n number of values to return
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of the first `n` feature values
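     *
     * For example, taking the first 10 values of a hypothetical feature `age: FeatureLike[Real]`:
     * {{{
     *   val ages: Array[Real] = ds.take(10, age)
     * }}}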
     */
    def take[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      n: Int, f: FeatureLike[F1]
    ): Array[F1] =
      select(f).take(n).map(r => r.getFeatureType(f))

    /**
     * Collects features from the dataset and returns the first `n` values.
     *
     * Running take requires moving data into the application's driver process, and
     * doing so with a very large `n` can crash the driver process with OutOfMemoryError.
     *
     * @param n number of values to return
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of the first `n` feature values
     */
    def take[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      n: Int, f1: FeatureLike[F1], f2: FeatureLike[F2]
    ): Array[(F1, F2)] =
      select(f1, f2).take(n).map(r => (r.getFeatureType(f1), r.getFeatureType(f2)))

    /**
     * Collects features from the dataset and returns the first `n` values.
     *
     * Running take requires moving data into the application's driver process, and
     * doing so with a very large `n` can crash the driver process with OutOfMemoryError.
     *
     * @param n number of values to return
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of the first `n` feature values
     */
    def take[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F3 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      n: Int, f1: FeatureLike[F1], f2: FeatureLike[F2], f3: FeatureLike[F3]
    ): Array[(F1, F2, F3)] =
      select(f1, f2, f3).take(n).map(r => (r.getFeatureType(f1), r.getFeatureType(f2), r.getFeatureType(f3)))

    /**
     * Collects features from the dataset and returns the first `n` values.
     *
     * Running take requires moving data into the application's driver process, and
     * doing so with a very large `n` can crash the driver process with OutOfMemoryError.
     *
     * @param n number of values to return
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of the first `n` feature values
     */
    def take[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F3 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F4 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      n: Int, f1: FeatureLike[F1], f2: FeatureLike[F2],
      f3: FeatureLike[F3], f4: FeatureLike[F4]
    ): Array[(F1, F2, F3, F4)] =
      select(f1, f2, f3, f4).take(n).map(r => (r.getFeatureType(f1), r.getFeatureType(f2),
        r.getFeatureType(f3), r.getFeatureType(f4)))

    /**
     * Collects features from the dataset and returns the first `n` values.
     *
     * Running take requires moving data into the application's driver process, and
     * doing so with a very large `n` can crash the driver process with OutOfMemoryError.
     *
     * @param n number of values to return
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return array of the first `n` feature values
     */
    def take[F1 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F2 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F3 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F4 <: FeatureType : FeatureTypeSparkConverter : ClassTag,
    F5 <: FeatureType : FeatureTypeSparkConverter : ClassTag](
      n: Int, f1: FeatureLike[F1], f2: FeatureLike[F2],
      f3: FeatureLike[F3], f4: FeatureLike[F4], f5: FeatureLike[F5]
    ): Array[(F1, F2, F3, F4, F5)] =
      select(f1, f2, f3, f4, f5).take(n).map(r => (r.getFeatureType(f1), r.getFeatureType(f2),
        r.getFeatureType(f3), r.getFeatureType(f4), r.getFeatureType(f5)))

    /**
     * Selects features from the dataset.
     *
     * @param features features to select
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return a dataset containing the selected features
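     *
     * For example, with hypothetical features `age` and `name`:
     * {{{
     *   val df: DataFrame = ds.select(age, name)
     * }}}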
     */
    def select(features: OPFeature*): DataFrame = {
      requireValidSchema(ds.schema, features)
      ds.select(features.map(FeatureSparkTypes.toColumn): _*)
    }

    /**
     * Returns metadata map for features
     *
     * @param features features to get metadata for
     * @throws IllegalArgumentException if dataset schema does not match the features
     * @return metadata map for features
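     *
     * For example, with a hypothetical feature `featureVector`:
     * {{{
     *   val meta: Map[OPFeature, Metadata] = ds.metadata(featureVector)
     * }}}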
     */
    def metadata(features: OPFeature*): Map[OPFeature, Metadata] = {
      val schema = ds.schema
      requireValidSchema(schema, features)

      val fields = schema.fields.map(f => f.name -> f).toMap

      features.foldLeft(Map.empty[OPFeature, Metadata])((acc, feature) =>
        fields.get(feature.name).map(field => acc + (feature -> field.metadata)).getOrElse(acc)
      )
    }

    /**
     * Check if dataset is empty
     *
     * @return true if dataset is empty, false otherwise
     */
    def isEmpty: Boolean = ds.head(1).isEmpty

    /**
     * Validate dataset schema against the specified features
     *
     * @param schema   dataset schema
     * @param features features to validate against
     * @throws IllegalArgumentException if dataset schema does not match the features
     */
    private def requireValidSchema(schema: StructType, features: Seq[OPFeature]): Unit = {
      val validationResults = FeatureSparkTypes.validateSchema(schema, features)
      require(validationResults.isEmpty,
        "Dataset schema does not match the features. Errors: " + validationResults.mkString("'", "','", "'")
      )
    }

  }

}



