All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.amazon.deequ.analyzers.KLLSketch.scala Maven / Gradle / Ivy

/**
 * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.analyzers

import java.nio.ByteBuffer

import scala.collection.mutable.ListBuffer
import scala.util.{Try, Failure}

import com.amazon.deequ.analyzers.catalyst.KLLSketchSerializer
import com.amazon.deequ.analyzers.runners.IllegalAnalyzerParameterException
import com.amazon.deequ.metrics.{BucketDistribution, BucketValue, KLLMetric}
import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import com.amazon.deequ.analyzers.runners.MetricCalculationException

import org.apache.spark.sql.DeequFunctions.stateful_kll
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.col

/**
 * Aggregation state carried by the KLL Sketch analyzer.
 *
 * @param qSketch the KLL sketch object held by this state
 * @param globalMax largest value among the samples represented by the sketch
 * @param globalMin smallest value among the samples represented by the sketch
 */
case class KLLState(
    qSketch: QuantileNonSample[Double],
    globalMax: Double,
    globalMin: Double)
  extends State[KLLState] {

  /**
   * Combines this state with another one: the sketches are merged, the larger of the two
   * maxima and the smaller of the two minima are retained.
   *
   * @param other the state to combine with this one
   * @return a new state describing both inputs
   */
  override def sum(other: KLLState): KLLState =
    KLLState(
      qSketch.merge(other.qSketch),
      globalMax = math.max(globalMax, other.globalMax),
      globalMin = math.min(globalMin, other.globalMin))
}

object KLLState {

  /**
   * Rebuilds a [[KLLState]] from its serialized form.
   *
   * The bytes are consumed in this order: one double holding the global minimum, one double
   * holding the global maximum, and then every remaining byte as the serialized sketch.
   *
   * @param bytes the serialized byte array of the state
   * @return the state for KLL Sketches
   */
  def fromBytes(bytes: Array[Byte]): KLLState = {
    val reader = ByteBuffer.wrap(bytes)

    // The minimum is stored before the maximum; the reads must stay in this order.
    val globalMin = reader.getDouble
    val globalMax = reader.getDouble

    // Whatever is left after the two doubles is the sketch payload.
    val sketchBytes = Array.ofDim[Byte](reader.remaining())
    reader.get(sketchBytes)

    KLLState(
      KLLSketchSerializer.serializer.deserialize(sketchBytes),
      globalMax,
      globalMin)
  }

}

/**
 * Tuning parameters accepted by the KLL Sketch analyzer.
 *
 * @param sketchSize size of the KLL sketch
 * @param shrinkingFactor shrinking factor of the KLL sketch
 * @param numberOfBuckets number of buckets in the reported distribution
 */
case class KLLParameters(
    sketchSize: Int,
    shrinkingFactor: Double,
    numberOfBuckets: Int)

/**
 * The KLL Sketch analyzer.
 *
 * Aggregates the column through `stateful_kll` and turns the resulting [[KLLState]] into a
 * [[KLLMetric]] holding an equal-width bucket distribution between the state's global minimum
 * and global maximum, plus the sketch parameters and compactor items.
 *
 * A `where` filter is currently disabled (see the commented-out parameter and aggregation).
 *
 * @param column the column to run the analyzer
 * @param kllParameters parameters of KLL Sketch; when `None`, the defaults declared in the
 *                      `KLLSketch` companion object are used
 */
case class KLLSketch(
    column: String,
//    where: Option[String] = None,
    kllParameters: Option[KLLParameters] = None)
  extends ScanShareableAnalyzer[KLLState, KLLMetric] {

  // These stay public `var`s so that existing code reading or reassigning them keeps working;
  // they are initialised directly from the optional parameters instead of via `Option.get`.
  var sketchSize: Int =
    kllParameters.map(_.sketchSize).getOrElse(KLLSketch.DEFAULT_SKETCH_SIZE)
  var shrinkingFactor: Double =
    kllParameters.map(_.shrinkingFactor).getOrElse(KLLSketch.DEFAULT_SHRINKING_FACTOR)
  var numberOfBuckets: Int =
    kllParameters.map(_.numberOfBuckets).getOrElse(KLLSketch.MAXIMUM_ALLOWED_DETAIL_BINS)

  /** Precondition rejecting a bucket count above the allowed maximum; ignores the schema. */
  private[this] val PARAM_CHECK: StructType => Unit = { _ =>
    if (numberOfBuckets > KLLSketch.MAXIMUM_ALLOWED_DETAIL_BINS) {
      throw new IllegalAnalyzerParameterException(
        s"Cannot return KLL Sketch related values for more " +
        s"than ${KLLSketch.MAXIMUM_ALLOWED_DETAIL_BINS} values")
    }
  }

  /** Single aggregation column: the stateful KLL aggregate over `column`. */
  override def aggregationFunctions(): Seq[Column] = {
    // stateful_kll(conditionalSelection(column, where), sketchSize, shrinkingFactor) :: Nil
    stateful_kll(col(column), sketchSize, shrinkingFactor) :: Nil
  }

  /**
   * Decodes the serialized state found at `offset` in the aggregation result row.
   *
   * @param result row produced by the aggregation
   * @param offset index of this analyzer's aggregation column within the row
   * @return the decoded state, wrapped by `ifNoNullsIn`
   */
  override def fromAggregationResult(result: Row, offset: Int): Option[KLLState] = {
    ifNoNullsIn(result, offset) { _ =>
      KLLState.fromBytes(result.getAs[Array[Byte]](offset))
    }
  }

  /**
   * Builds the metric from an optional state.
   *
   * The range between the state's global minimum and maximum is cut into `numberOfBuckets`
   * equal-width buckets. Every bucket but the last counts with exclusive ranks on both bounds;
   * the last bucket uses the inclusive rank of its upper bound so the maximum is counted.
   * Any exception thrown while building the distribution is captured in the metric's `Try`.
   *
   * @param state the aggregated state, or `None` when no state is available
   * @return the metric, carrying a failure when `state` is empty
   */
  override def computeMetricFrom(state: Option[KLLState]): KLLMetric = {
    state match {

      case Some(theState) =>
        val value: Try[BucketDistribution] = Try {

          val finalSketch = theState.qSketch
          val start = theState.globalMin
          val end = theState.globalMax
          val bucketCount = numberOfBuckets
          val lastBucket = bucketCount - 1

          // k-th boundary of the equal-width partition of [start, end].
          def boundary(k: Int): Double = start + (end - start) * k / bucketCount.toDouble

          val buckets = (0 until bucketCount).map { i =>
            val lowBound = boundary(i)
            if (i == lastBucket) {
              // Use the exact maximum as the final upper bound. Recomputing it as
              // start + (end - start) can round below `end` (e.g. start = -1e16, end = 1.0
              // gives 0.0), which would drop the largest values from the last bucket.
              BucketValue(lowBound, end,
                finalSketch.getRank(end) - finalSketch.getRankExclusive(lowBound))
            } else {
              val highBound = boundary(i + 1)
              BucketValue(lowBound, highBound,
                finalSketch.getRankExclusive(highBound) - finalSketch.getRankExclusive(lowBound))
            }
          }

          val parameters = List[Double](finalSketch.shrinkingFactor,
            finalSketch.sketchSize.toDouble)
          val data = finalSketch.getCompactorItems
          BucketDistribution(buckets.toList, parameters, data)
        }

        KLLMetric(column, value)

      case None =>
        KLLMetric(column, Failure(Analyzers.emptyStateException(this)))
    }

  }

  /** Wraps `exception` (if necessary) into a failed metric for `column`. */
  override def toFailureMetric(exception: Exception): KLLMetric = {
    KLLMetric(column, Failure(MetricCalculationException.wrapIfNecessary(exception)))
  }


  /** Checks run before the analysis: parameter limit, column presence, numeric column type. */
  override def preconditions(): Seq[StructType => Unit] = {
    PARAM_CHECK :: hasColumn(column) :: isNumeric(column) :: Nil
  }
}

/** Default settings and limits used by the KLL Sketch analyzer. */
object KLLSketch {

  /** Sketch size applied when no explicit parameters are supplied. */
  val DEFAULT_SKETCH_SIZE: Int = 2048

  /** Shrinking factor applied when no explicit parameters are supplied. */
  val DEFAULT_SHRINKING_FACTOR: Double = 0.64

  /** Upper limit on the number of buckets; also the default bucket count. */
  val MAXIMUM_ALLOWED_DETAIL_BINS: Int = 100
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy