All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sparkutils.quality.impl.id.GenericLongBasedImports.scala Maven / Gradle / Ivy

There is a newer version: 0.1.3.1-RC3
Show newest version
package com.sparkutils.quality.impl.id

import com.sparkutils.quality.impl.hash.{HashFunctionFactory, HashFunctionsExpression, MessageDigestFactory}
import com.sparkutils.quality.impl.rng.RandLongsWithJump
import org.apache.commons.rng.simple.RandomSource
import org.apache.spark.sql.Column
import org.apache.spark.sql.shim.hash.DigestFactory

trait GenericLongBasedImports {
  /**
   * Builds a random number based ID column using RandomSource.XO_RO_SHI_RO_128_PP
   * with a fixed seed of 0L.
   *
   * @param prefix handed unchanged to GenericLongBasedIDExpression
   * @return a Column wrapping a model.RandomID expression
   */
  def rngID(prefix: String): Column = {
    val generator = RandLongsWithJump(0L, RandomSource.XO_RO_SHI_RO_128_PP)
    new Column(GenericLongBasedIDExpression(model.RandomID, generator, prefix))
  }

  /**
   * Builds a random number based ID column from a caller chosen RandomSource and seed.
   *
   * @param prefix handed unchanged to GenericLongBasedIDExpression
   * @param randomSource the Commons RNG source given to RandLongsWithJump
   * @param seed the seed given to RandLongsWithJump, 0L when omitted
   * @return a Column wrapping a model.RandomID expression
   */
  def rng_id(prefix: String, randomSource: RandomSource, seed: Long = 0L): Column = {
    val generator = RandLongsWithJump(seed, randomSource)
    new Column(GenericLongBasedIDExpression(model.RandomID, generator, prefix))
  }

  /**
   * Builds a digest based ID column over the given fields, MD5 unless told otherwise.
   *
   * @param prefix handed unchanged to GenericLongBasedIDExpression
   * @param children the columns whose expressions are fed to HashFunctionsExpression
   * @param digestImpl the digest name, given both to HashFunctionsExpression and to digestFactory
   * @param digestFactory creates the DigestFactory for digestImpl, MessageDigestFactory by default
   * @return a Column wrapping a model.FieldBasedID expression
   */
  def fieldBasedID(prefix: String, children: Seq[Column], digestImpl: String = "MD5", digestFactory: String => DigestFactory = MessageDigestFactory): Column = {
    val fieldExpressions = children.map(_.expr)
    val digest = HashFunctionsExpression(fieldExpressions, digestImpl, true, digestFactory(digestImpl))
    new Column(GenericLongBasedIDExpression(model.FieldBasedID, digest, prefix))
  }

  // NB: field_based_id lives in HashRelatedFunctionImports - same implementation and interface,
  // it simply matches the SQL function name

  /**
   * Varargs form of the digest based ID; prefer field_based_id, which follows the SQL naming.
   *
   * @param prefix handed unchanged to the Seq based fieldBasedID
   * @param digestImpl the digest name to use
   * @param children the columns to build the ID from
   * @return the Column produced by the Seq based fieldBasedID using its default digestFactory
   */
  @deprecated(since = "0.1.0", message = "migrate to field_based_id")
  def fieldBasedID(prefix: String, digestImpl: String, children: Column *): Column = {
    fieldBasedID(prefix, children, digestImpl)
  }

  /**
   * Legacy name for provided_id; prefer provided_id, which follows the SQL function naming.
   *
   * @param prefix handed unchanged to provided_id
   * @param child the column supplying the upstream compatible long values
   * @return the Column produced by provided_id
   */
  @deprecated(since = "0.1.0", message = "migrate to provided_id")
  def providedID(prefix: String, child: Column): Column = {
    provided_id(prefix, child)
  }

  /**
   * Builds an ID column directly on top of an upstream compatible long generator.
   *
   * @param prefix handed unchanged to GenericLongBasedIDExpression
   * @param child the column whose expression is used as the ID source
   * @return a Column wrapping a model.ProvidedID expression
   */
  def provided_id(prefix: String, child: Column): Column = {
    val upstream = child.expr
    new Column(GenericLongBasedIDExpression(model.ProvidedID, upstream, prefix))
  }

  /**
   * Builds a hash function based ID column over the given fields (Murmur3).
   *
   * The factory is always created via HashFunctionFactory("IGNORED"); digestImpl is only
   * forwarded to HashFunctionsExpression.
   *
   * @param prefix handed unchanged to GenericLongBasedIDExpression
   * @param children the columns whose expressions are fed to HashFunctionsExpression
   * @param digestImpl - only Murmur3 currently supported
   * @return a Column wrapping a model.FieldBasedID expression
   */
  def hashID(prefix: String, children: Seq[Column], digestImpl: String = "IGNORED"): Column = {
    val fieldExpressions = children.map(_.expr)
    val hashed = HashFunctionsExpression(fieldExpressions, digestImpl, true, HashFunctionFactory("IGNORED"))
    new Column(GenericLongBasedIDExpression(model.FieldBasedID, hashed, prefix))
  }

  /**
   * Varargs form of hashID, forwarding to the Seq based overload.
   *
   * @param prefix handed unchanged to the Seq based hashID
   * @param digestImpl the hash implementation name
   * @param children the columns to build the ID from
   * @return the Column produced by the Seq based hashID
   */
  def hashID(prefix: String, digestImpl: String, children: Column*): Column = {
    hashID(prefix, children, digestImpl)
  }

  /**
   * Murmur3 based ID over a sequence of columns, calls hashID with "M3_128".
   *
   * @param prefix handed unchanged to hashID
   * @param children the columns to build the ID from
   * @return the Column produced by hashID
   */
  def murmur3ID(prefix: String, children: Seq[Column]): Column = {
    hashID(prefix, children, "M3_128")
  }

  /**
   * Murmur3 based ID requiring at least one column, calls hashID with "M3_128".
   *
   * @param prefix handed unchanged to hashID
   * @param child1 the first, mandatory, column
   * @param restOfchildren any further columns, placed after child1
   * @return the Column produced by hashID
   */
  def murmur3ID(prefix: String, child1: Column, restOfchildren: Column*): Column = {
    val allChildren = child1 +: restOfchildren
    hashID(prefix, allChildren, "M3_128")
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy