// io.prophecy.libs.python.UDFUtils.scala
// Artifact: prophecy-libs_2.12 (Prophecy Spark Libraries)
// Note: this listing is not from the latest release; a newer version (6.3.0-3.3.0) exists.
package io.prophecy.libs.python

import org.apache.spark.sql._
import org.apache.spark.sql.expressions.UserDefinedFunction

/**
  * List-based entry points to the lookup helpers of `io.prophecy.libs`.
  *
  * Each method takes its trailing arguments as a `List` and forwards them, expanded as varargs,
  * to the function of the same name in `io.prophecy.libs`. No other logic lives in this object.
  *
  * NOTE(review): judging by the package name (`python`), this presumably exists for callers on
  * the Python side, where passing Scala varargs is awkward - confirm.
  */
object UDFUtils {

  /**
    * Forwards to `io.prophecy.libs.lookup`, which by default yields only the first matching
    * record. Use [[lookup_last]] for the last one.
    *
    * @param lookupName name of the lookup to query
    * @param cols       key columns, passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup`
    */
  def lookup(lookupName: String, cols: List[Column]): Column = {
    val keyColumns: Seq[Column] = cols
    io.prophecy.libs.lookup(lookupName, keyColumns: _*)
  }

  /**
    * Forwards to `io.prophecy.libs.lookup_last`, which yields the last matching record.
    *
    * @param lookupName name of the lookup to query
    * @param cols       key columns, passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup_last`
    */
  def lookup_last(lookupName: String, cols: List[Column]): Column = {
    val keyColumns: Seq[Column] = cols
    io.prophecy.libs.lookup_last(lookupName, keyColumns: _*)
  }

  /**
    * Forwards to `io.prophecy.libs.lookup_match`, which reports whether any record matches.
    *
    * NOTE(review): the result has been described both as a Boolean column and as
    * 0 (no match) / 1 (match) - confirm against `io.prophecy.libs.lookup_match`.
    *
    * @param lookupName name of the lookup to query
    * @param cols       key columns, passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup_match`
    */
  def lookup_match(lookupName: String, cols: List[Column]): Column = {
    val keyColumns: Seq[Column] = cols
    io.prophecy.libs.lookup_match(lookupName, keyColumns: _*)
  }

  /**
    * Forwards to `io.prophecy.libs.lookup_count`, described as returning the count of all
    * rows matching the given keys.
    *
    * @param lookupName name of the lookup to query
    * @param cols       key columns, passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup_count`
    */
  def lookup_count(lookupName: String, cols: List[Column]): Column = {
    val keyColumns: Seq[Column] = cols
    io.prophecy.libs.lookup_count(lookupName, keyColumns: _*)
  }

  /**
    * Forwards to `io.prophecy.libs.lookup_row`, described as returning all rows matching the
    * given keys.
    *
    * @param lookupName name of the lookup to query
    * @param cols       key columns, passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup_row`
    */
  def lookup_row(lookupName: String, cols: List[Column]): Column = {
    val keyColumns: Seq[Column] = cols
    io.prophecy.libs.lookup_row(lookupName, keyColumns: _*)
  }

  /**
    * Forwards to `io.prophecy.libs.lookup_row_reverse`.
    *
    * NOTE(review): going by the name, presumably the matching rows in reverse order; the
    * actual semantics are defined by the delegate - confirm.
    *
    * @param lookupName name of the lookup to query
    * @param cols       key columns, passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup_row_reverse`
    */
  def lookup_row_reverse(lookupName: String, cols: List[Column]): Column = {
    val keyColumns: Seq[Column] = cols
    io.prophecy.libs.lookup_row_reverse(lookupName, keyColumns: _*)
  }

  /**
    * Forwards to `io.prophecy.libs.lookup_nth`.
    *
    * NOTE(review): going by the name, presumably the n-th matching row; how `n` is supplied
    * (possibly as one of `cols`) is not visible here - confirm.
    *
    * @param lookupName name of the lookup to query
    * @param cols       columns passed on as varargs
    * @return the column produced by `io.prophecy.libs.lookup_nth`
    */
  def lookup_nth(lookupName: String, cols: List[Column]): Column = {
    val forwardedColumns: Seq[Column] = cols
    io.prophecy.libs.lookup_nth(lookupName, forwardedColumns: _*)
  }

  /**
    * Builds a named lookup from a dataframe by delegating to `io.prophecy.libs.createLookup`.
    *
    * The wrapped function is described as follows: it keeps the data of `df` in a broadcast
    * variable and registers UDFs with the Spark registry that read from it:
    *
    *  - lookup       : first matching row for the given keys
    *  - lookup_count : number of matching rows for the given keys
    *  - lookup_match : 0 when nothing matches, 1 when at least one row matches
    *  - lookup_row   : all matching rows for the given keys
    *
    * Registration is described as covering up to 10 key columns.
    *
    * @param name    UDF name
    * @param df      input dataframe
    * @param spark   spark session
    * @param keyCols columns used as keys by the lookup functions
    * @param rowCols columns making up the row stored for each key, passed on as varargs
    * @return the UDF returned by `io.prophecy.libs.createLookup`; its result depends on which
    *         lookup function is used
    */
  def createLookup(
    name:    String,
    df:      DataFrame,
    spark:   SparkSession,
    keyCols: List[String],
    rowCols: List[String]
  ): UserDefinedFunction = {
    val storedColumns: Seq[String] = rowCols
    io.prophecy.libs.createLookup(name, df, spark, keyCols, storedColumns: _*)
  }

  /**
    * Builds a range lookup from a dataframe by delegating to
    * `io.prophecy.libs.createRangeLookup`.
    *
    * The wrapped function is described as follows: it loads the data of `df` into a broadcast
    * variable and defines a UDF taking a double. When that double lies between the values of
    * `minColumn` and `maxColumn` of a stored row, the row is part of the result; otherwise the
    * result is null. Whether the bounds are inclusive is not stated here.
    *
    * @param name         name of the created UDF
    * @param df           input dataframe
    * @param spark        spark session
    * @param minColumn    column holding the lower bound of the range
    * @param maxColumn    column holding the upper bound of the range
    * @param valueColumns remaining columns to include in the result, passed on as varargs
    * @return the UDF returned by `io.prophecy.libs.createRangeLookup`
    */
  def createRangeLookup(
    name:         String,
    df:           DataFrame,
    spark:        SparkSession,
    minColumn:    String,
    maxColumn:    String,
    valueColumns: List[String]
  ): UserDefinedFunction = {
    val resultColumns: Seq[String] = valueColumns
    io.prophecy.libs.createRangeLookup(name, df, spark, minColumn, maxColumn, resultColumns: _*)
  }
}