org.apache.spark.sql.api.DataFrameNaFunctions.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-sql-api_2.13 Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.api

import scala.jdk.CollectionConverters._

import _root_.java.util

import org.apache.spark.annotation.Stable
import org.apache.spark.sql.Row
import org.apache.spark.util.ArrayImplicits._

/**
 * Functionality for working with missing data in `DataFrame`s.
 *
 * @since 1.3.1
 */
@Stable
abstract class DataFrameNaFunctions[DS[U] <: Dataset[U, DS]] {

  /**
   * Returns a new `DataFrame` that drops rows containing any null or NaN values.
   *
   * @since 1.3.1
   */
  def drop(): DS[Row] = drop("any")

  /**
   * Returns a new `DataFrame` that drops rows containing null or NaN values.
   *
   * If `how` is "any", then drop rows containing any null or NaN values. If `how` is "all", then
   * drop rows only if every column is null or NaN for that row.
   *
   * @since 1.3.1
   */
  def drop(how: String): DS[Row] = drop(toMinNonNulls(how))

  /**
   * Returns a new `DataFrame` that drops rows containing any null or NaN values in the specified
   * columns.
   *
   * @since 1.3.1
   */
  def drop(cols: Array[String]): DS[Row] = drop(cols.toImmutableArraySeq)

  /**
   * (Scala-specific) Returns a new `DataFrame` that drops rows containing any null or NaN values
   * in the specified columns.
   *
   * @since 1.3.1
   */
  def drop(cols: Seq[String]): DS[Row] = drop(cols.size, cols)

  /**
   * Returns a new `DataFrame` that drops rows containing null or NaN values in the specified
   * columns.
   *
   * If `how` is "any", then drop rows containing any null or NaN values in the specified columns.
   * If `how` is "all", then drop rows only if every specified column is null or NaN for that row.
   *
   * @since 1.3.1
   */
  def drop(how: String, cols: Array[String]): DS[Row] = drop(how, cols.toImmutableArraySeq)

  /**
   * (Scala-specific) Returns a new `DataFrame` that drops rows containing null or NaN values in
   * the specified columns.
   *
   * If `how` is "any", then drop rows containing any null or NaN values in the specified columns.
   * If `how` is "all", then drop rows only if every specified column is null or NaN for that row.
   *
   * @since 1.3.1
   */
  def drop(how: String, cols: Seq[String]): DS[Row] = drop(toMinNonNulls(how), cols)

  /**
   * Returns a new `DataFrame` that drops rows containing less than `minNonNulls` non-null and
   * non-NaN values.
   *
   * @since 1.3.1
   */
  def drop(minNonNulls: Int): DS[Row] = drop(Option(minNonNulls))

  /**
   * Returns a new `DataFrame` that drops rows containing less than `minNonNulls` non-null and
   * non-NaN values in the specified columns.
   *
   * @since 1.3.1
   */
  def drop(minNonNulls: Int, cols: Array[String]): DS[Row] =
    drop(minNonNulls, cols.toImmutableArraySeq)

  /**
   * (Scala-specific) Returns a new `DataFrame` that drops rows containing less than `minNonNulls`
   * non-null and non-NaN values in the specified columns.
   *
   * @since 1.3.1
   */
  def drop(minNonNulls: Int, cols: Seq[String]): DS[Row] = drop(Option(minNonNulls), cols)

  private def toMinNonNulls(how: String): Option[Int] = {
    how.toLowerCase(util.Locale.ROOT) match {
      case "any" => None // No-Op. Do nothing.
      case "all" => Some(1)
      case _ => throw new IllegalArgumentException(s"how ($how) must be 'any' or 'all'")
    }
  }

  protected def drop(minNonNulls: Option[Int]): DS[Row]

  protected def drop(minNonNulls: Option[Int], cols: Seq[String]): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`.
   *
   * @since 2.2.0
   */
  def fill(value: Long): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`.
   * @since 1.3.1
   */
  def fill(value: Double): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null values in string columns with `value`.
   *
   * @since 1.3.1
   */
  def fill(value: String): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. If a
   * specified column is not a numeric column, it is ignored.
   *
   * @since 2.2.0
   */
  def fill(value: Long, cols: Array[String]): DS[Row] = fill(value, cols.toImmutableArraySeq)

  /**
   * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. If a
   * specified column is not a numeric column, it is ignored.
   *
   * @since 1.3.1
   */
  def fill(value: Double, cols: Array[String]): DS[Row] = fill(value, cols.toImmutableArraySeq)

  /**
   * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified
   * numeric columns. If a specified column is not a numeric column, it is ignored.
   *
   * @since 2.2.0
   */
  def fill(value: Long, cols: Seq[String]): DS[Row]

  /**
   * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified
   * numeric columns. If a specified column is not a numeric column, it is ignored.
   *
   * @since 1.3.1
   */
  def fill(value: Double, cols: Seq[String]): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null values in specified string columns. If a
   * specified column is not a string column, it is ignored.
   *
   * @since 1.3.1
   */
  def fill(value: String, cols: Array[String]): DS[Row] = fill(value, cols.toImmutableArraySeq)

  /**
   * (Scala-specific) Returns a new `DataFrame` that replaces null values in specified string
   * columns. If a specified column is not a string column, it is ignored.
   *
   * @since 1.3.1
   */
  def fill(value: String, cols: Seq[String]): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null values in boolean columns with `value`.
   *
   * @since 2.3.0
   */
  def fill(value: Boolean): DS[Row]

  /**
   * (Scala-specific) Returns a new `DataFrame` that replaces null values in specified boolean
   * columns. If a specified column is not a boolean column, it is ignored.
   *
   * @since 2.3.0
   */
  def fill(value: Boolean, cols: Seq[String]): DS[Row]

  /**
   * Returns a new `DataFrame` that replaces null values in specified boolean columns. If a
   * specified column is not a boolean column, it is ignored.
   *
   * @since 2.3.0
   */
  def fill(value: Boolean, cols: Array[String]): DS[Row] = fill(value, cols.toImmutableArraySeq)

  /**
   * Returns a new `DataFrame` that replaces null values.
   *
   * The key of the map is the column name, and the value of the map is the replacement value. The
   * value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`,
   * `Boolean`. Replacement values are cast to the column data type.
   *
   * For example, the following replaces null values in column "A" with string "unknown", and null
   * values in column "B" with numeric value 1.0.
   * {{{
   *   import com.google.common.collect.ImmutableMap;
   *   df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0));
   * }}}
   *
   * @since 1.3.1
   */
  def fill(valueMap: util.Map[String, Any]): DS[Row] = fillMap(valueMap.asScala.toSeq)

  /**
   * (Scala-specific) Returns a new `DataFrame` that replaces null values.
   *
   * The key of the map is the column name, and the value of the map is the replacement value. The
   * value must be of the following type: `Int`, `Long`, `Float`, `Double`, `String`, `Boolean`.
   * Replacement values are cast to the column data type.
   *
   * For example, the following replaces null values in column "A" with string "unknown", and null
   * values in column "B" with numeric value 1.0.
   * {{{
   *   df.na.fill(Map(
   *     "A" -> "unknown",
   *     "B" -> 1.0
   *   ))
   * }}}
   *
   * @since 1.3.1
   */
  def fill(valueMap: Map[String, Any]): DS[Row] = fillMap(valueMap.toSeq)

  protected def fillMap(values: Seq[(String, Any)]): DS[Row]

  /**
   * Replaces values matching keys in `replacement` map with the corresponding values.
   *
   * {{{
   *   import com.google.common.collect.ImmutableMap;
   *
   *   // Replaces all occurrences of 1.0 with 2.0 in column "height".
   *   df.na.replace("height", ImmutableMap.of(1.0, 2.0));
   *
   *   // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name".
   *   df.na.replace("name", ImmutableMap.of("UNKNOWN", "unnamed"));
   *
   *   // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns.
   *   df.na.replace("*", ImmutableMap.of("UNKNOWN", "unnamed"));
   * }}}
   *
   * @param col
   *   name of the column to apply the value replacement. If `col` is "*", replacement is applied
   *   on all string, numeric or boolean columns.
   * @param replacement
   *   value replacement map. Key and value of `replacement` map must have the same type, and can
   *   only be doubles, strings or booleans. The map value can have nulls.
   *
   * @since 1.3.1
   */
  def replace[T](col: String, replacement: util.Map[T, T]): DS[Row] = {
    replace[T](col, replacement.asScala.toMap)
  }

  /**
   * Replaces values matching keys in `replacement` map with the corresponding values.
   *
   * {{{
   *   import com.google.common.collect.ImmutableMap;
   *
   *   // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight".
   *   df.na.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0));
   *
   *   // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname".
   *   df.na.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed"));
   * }}}
   *
   * @param cols
   *   list of columns to apply the value replacement. If `col` is "*", replacement is applied on
   *   all string, numeric or boolean columns.
   * @param replacement
   *   value replacement map. Key and value of `replacement` map must have the same type, and can
   *   only be doubles, strings or booleans. The map value can have nulls.
   *
   * @since 1.3.1
   */
  def replace[T](cols: Array[String], replacement: util.Map[T, T]): DS[Row] = {
    replace(cols.toImmutableArraySeq, replacement.asScala.toMap)
  }

  /**
   * (Scala-specific) Replaces values matching keys in `replacement` map.
   *
   * {{{
   *   // Replaces all occurrences of 1.0 with 2.0 in column "height".
   *   df.na.replace("height", Map(1.0 -> 2.0));
   *
   *   // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name".
   *   df.na.replace("name", Map("UNKNOWN" -> "unnamed"));
   *
   *   // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns.
   *   df.na.replace("*", Map("UNKNOWN" -> "unnamed"));
   * }}}
   *
   * @param col
   *   name of the column to apply the value replacement. If `col` is "*", replacement is applied
   *   on all string, numeric or boolean columns.
   * @param replacement
   *   value replacement map. Key and value of `replacement` map must have the same type, and can
   *   only be doubles, strings or booleans. The map value can have nulls.
   *
   * @since 1.3.1
   */
  def replace[T](col: String, replacement: Map[T, T]): DS[Row]

  /**
   * (Scala-specific) Replaces values matching keys in `replacement` map.
   *
   * {{{
   *   // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight".
   *   df.na.replace("height" :: "weight" :: Nil, Map(1.0 -> 2.0));
   *
   *   // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname".
   *   df.na.replace("firstname" :: "lastname" :: Nil, Map("UNKNOWN" -> "unnamed"));
   * }}}
   *
   * @param cols
   *   list of columns to apply the value replacement. If `col` is "*", replacement is applied on
   *   all string, numeric or boolean columns.
   * @param replacement
   *   value replacement map. Key and value of `replacement` map must have the same type, and can
   *   only be doubles, strings or booleans. The map value can have nulls.
   *
   * @since 1.3.1
   */
  def replace[T](cols: Seq[String], replacement: Map[T, T]): DS[Row]
}