All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sparkutils.quality.impl.SparkHelpers.scala Maven / Gradle / Ivy

package com.sparkutils.quality.impl

import frameless.{Injection, NotCatalystNullable, TypedColumn, TypedEncoder, TypedExpressionEncoder}
import com.sparkutils.quality._
import com.sparkutils.quality.impl.util.Serializing
import org.apache.spark.sql.{Encoder, Row}
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, GenericArrayData, MapData}
import org.apache.spark.sql.types.{DataType, IntegerType, LongType, MapType, StringType, StructField, StructType}
import shapeless.ops.hlist.IsHCons
import shapeless.{HList, LabelledGeneric, Lazy}

import scala.reflect.ClassTag



trait IntEncodersImplicits extends Serializable {

  // RuleResultWithProcessor's are lost in serialization by design, they only make sense in a given run

  /**
   * Converts the DQ results to and from Int and RuleResult's.  Probability is processed by the value / PassedInt
   */
  implicit val ruleResultToInt: Injection[RuleResult, Int] = Injection(
    {
      case r: RuleResult => Serializing.ruleResultToInt(r)
    },
    {
      case SoftFailedInt => SoftFailed
      case DisabledRuleInt => DisabledRule
      case FailedInt => Failed
      case PassedInt => Passed
      case a: Int => Probability(a.toDouble / PassedInt)
    })

}

object IntEncoders extends IntEncodersImplicits {
}

trait IdEncodersImplicits extends Serializable {
  implicit val versionedIdNotNullable = new NotCatalystNullable[VersionedId] {}

  // try just id first
  implicit val versionedIdTo: Injection[VersionedId, Long] = Injection(
    {
      case Id(id, version) => ((id.toLong) << 32) | (version & 0xffffffffL)
    },
    {
      case a: Long =>
        PackId.unpack(a)
    })
}

object IdEncoders extends IdEncodersImplicits {

}

trait EncodersImplicits extends Serializable {
  import frameless._
  import IntEncoders._
  import IdEncoders._

  implicit val ruleSuiteResultTypedEnc = TypedEncoder[RuleSuiteResult]

  implicit val ruleSuiteResultExpEnc = TypedExpressionEncoder[RuleSuiteResult]

  implicit val ruleSuiteResultDetailsTypedEnc = TypedEncoder[com.sparkutils.quality.RuleSuiteResultDetails]

  implicit val ruleSuiteResultDetailsExpEnc = TypedExpressionEncoder[com.sparkutils.quality.RuleSuiteResultDetails]

  implicit def generalExpressionsResultTypedEnc[R: TypedEncoder] = TypedEncoder[com.sparkutils.quality.GeneralExpressionsResult[R]]

  implicit def generalExpressionsResultExpEnc[R](implicit ev: TypedEncoder[GeneralExpressionsResult[R]]) = TypedExpressionEncoder[com.sparkutils.quality.GeneralExpressionsResult[R]]

  implicit val generalExpressionResultTypedEnc = TypedEncoder[com.sparkutils.quality.GeneralExpressionResult]

  implicit val generalExpressionResultExpEnc = TypedExpressionEncoder[com.sparkutils.quality.GeneralExpressionResult]

  implicit val generalExpressionsResultNoDDLTypedEnc = TypedEncoder[com.sparkutils.quality.GeneralExpressionsResultNoDDL]

  implicit val generalExpressionsResultNoDDLExpEnc = TypedExpressionEncoder[com.sparkutils.quality.GeneralExpressionsResultNoDDL]

  implicit def ruleEngineResultTypedEnc[T: TypedEncoder, G <: HList, H <: HList](implicit
                                                                                  i0: LabelledGeneric.Aux[RuleEngineResult[T], G],
                                                                                  i1: DropUnitValues.Aux[G, H],
                                                                                  i2: IsHCons[H],
                                                                                  i3: Lazy[RecordEncoderFields[H]],
                                                                                  i4: Lazy[NewInstanceExprs[G]],
                                                                                  i5: ClassTag[RuleEngineResult[T]]
  ): TypedEncoder[RuleEngineResult[T]] = {
    TypedEncoder.usingDerivation[RuleEngineResult[T], G, H]
  }

  implicit def ruleEngineResultExpEnc[T: TypedEncoder]: Encoder[RuleEngineResult[T]] = TypedExpressionEncoder[RuleEngineResult[T]]

  implicit def ruleFolderResultTypedEnc[T: TypedEncoder, G <: HList, H <: HList](implicit
                                                                                 i0: LabelledGeneric.Aux[RuleFolderResult[T], G],
                                                                                 i1: DropUnitValues.Aux[G, H],
                                                                                 i2: IsHCons[H],
                                                                                 i3: Lazy[RecordEncoderFields[H]],
                                                                                 i4: Lazy[NewInstanceExprs[G]],
                                                                                 i5: ClassTag[RuleFolderResult[T]]
                                                                                ): TypedEncoder[RuleFolderResult[T]] = {
    TypedEncoder.usingDerivation[RuleFolderResult[T], G, H]
  }

  implicit def ruleFolderResultExpEnc[T: TypedEncoder]: Encoder[RuleFolderResult[T]] = TypedExpressionEncoder[RuleFolderResult[T]]

}

object Encoders extends EncodersImplicits {

}

object EmptyMap extends MapData {
  override val keyArray: ArrayData = new org.apache.spark.sql.catalyst.util.GenericArrayData(Array.empty)
  override val valueArray: ArrayData = new org.apache.spark.sql.catalyst.util.GenericArrayData(Array.empty)
  override def numElements(): Int = 0

  override def copy(): MapData = this

  override def toString: String = {
    s"keys: $keyArray, values: $valueArray"
  }
}

object MapUtils {
  def toScalaMap(map: MapData, keyType: DataType, valueType: DataType): Map[Any, Any] = {
    val keys = map.keyArray.toObjectArray(keyType)
    val values = map.valueArray.toObjectArray(valueType)
    keys.zip(values).toMap
  }

  /**
   * Assuming MapData entries are stable, as they are array backed, over a dataset.  When the position of 'what' is not in the cachedPositions a full search is performed with the new location being returned in addition.
   *
   * @param mapData the source mapdata to search through.
   * @param cachedPositions already known positions to check for what
   * @param what compared against the keyArray to obtain the position
   * @param getValue obtain the value from the mapData, with any necessary conversions, against the index parameter
   * @tparam T
   * @return (result from getValue or null when not found, either cachedPositions or cachedPositions and a newly found position)
   */
  def getMapEntry[T <: AnyRef](mapData: MapData, cachedPositions: Seq[Int], what: Any)(getValue: Int => T): (T, Seq[Int]) = {
    val keys = mapData.keyArray()
    val n = keys.numElements()
    val acc = (idx: Int) => keys.getLong(idx)

    def search() = {
      var i = 0
      var found = false
      while (i < n && !found) {
        if (acc(i) == what) {
          found = true
        } else {
          i += 1
        }
      }
      if (!found)
        -1
      else
        i
    }

    def withCached() =
      cachedPositions.find { i =>
        if (i < n)
          acc(i) == what
        else
          false
      }

    withCached().map(i => (getValue(i), cachedPositions))
      .getOrElse {
        val r = search()
        if (r == -1)
          (null.asInstanceOf[T], cachedPositions)
        else
          (getValue(r), cachedPositions :+ r)
      }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy