All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.amazon.deequ.analyzers.DatasetMatchAnalyzer.scala Maven / Gradle / Ivy

/**
 * Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.metricFromFailure
import com.amazon.deequ.comparison.DataSynchronization
import com.amazon.deequ.comparison.DatasetMatchFailed
import com.amazon.deequ.comparison.DatasetMatchSucceeded
import com.amazon.deequ.metrics.DoubleMetric
import com.amazon.deequ.metrics.Entity
import org.apache.spark.sql.DataFrame

import scala.util.Failure
import scala.util.Try


/**
 * An Analyzer for Deequ that performs a dataset match check between two DataFrames.
 * It evaluates the degree of match based on specified column mappings and an assertion function.
 *
 * The analyzer computes a ratio of matched data points to the total data points, represented as a DoubleMetric.
 * Refer to [[com.amazon.deequ.comparison.DataSynchronization.columnMatch]] for dataset match implementation
 *
 * @param dfToCompare The DataFrame to compare with the primary DataFrame that is setup
 *                    during [[com.amazon.deequ.VerificationSuite.onData]] setup.
 * @param columnMappings A map where each key-value pair represents a column in the primary DataFrame
 *                       and its corresponding column in dfToCompare.
 * @param matchColumnMappings A map defining the column correlations between the current DataFrame and otherDf.
 *                            These are the columns which we will check for equality, post joining.
 *                            It's an optional value with defaults to None.
 * @param assertion A function that takes a Double (the match ratio) and returns a Boolean.
 *                  It defines the condition for successful synchronization.
 *
 * Usage:
 * This analyzer is used in Deequ's VerificationSuite based if `doesDatasetMatch` check is defined or could be used
 * manually as well.
 *
 * Example:
 * val analyzer = DatasetMatchAnalyzer(dfToCompare, Map("col1" -> "col2"), _ > 0.8)
 * val verificationResult = VerificationSuite().onData(df).addAnalyzer(analyzer).run()
 *
 * // or could do something like below
 * val verificationResult = VerificationSuite().onData(df).doesDatasetMatch(dfToCompare, Map("col1" -> "col2"),
 *                                                                              _ > 0.8).run()
 *
 *
 * The computeStateFrom method calculates the datasetmatch state by comparing the specified columns of the two
 * DataFrames.
 * The computeMetricFrom method then converts this state into a DoubleMetric representing the match ratio.
 *
 */
case class DatasetMatchAnalyzer(dfToCompare: DataFrame,
                                columnMappings: Map[String, String],
                                assertion: Double => Boolean,
                                matchColumnMappings: Option[Map[String, String]] = None)
  extends Analyzer[DatasetMatchState, DoubleMetric] {

  override def computeStateFrom(data: DataFrame, filterCondition: Option[String] = None): Option[DatasetMatchState] = {

    val result = if (matchColumnMappings.isDefined) {
      DataSynchronization.columnMatch(data, dfToCompare, columnMappings, matchColumnMappings.get, assertion)
    } else {
      DataSynchronization.columnMatch(data, dfToCompare, columnMappings, assertion)
    }

    result match {
      case succeeded: DatasetMatchSucceeded =>
        Some(DatasetMatchState(succeeded.passedCount, succeeded.totalCount))
      case failed: DatasetMatchFailed =>
        Some(DatasetMatchState(failed.passedCount.getOrElse(0), failed.totalCount.getOrElse(0)))
      case _ => None
    }
  }

  override def computeMetricFrom(state: Option[DatasetMatchState]): DoubleMetric = {

    val metric = state match {
      case Some(s) => Try(s.matchedDataCount.toDouble / s.totalDataCount.toDouble)
      case _ => Failure(new IllegalStateException("No state available for DataSynchronizationAnalyzer"))
    }

    DoubleMetric(Entity.Dataset, "DataSynchronization", "", metric, None)
  }

  override private[deequ] def toFailureMetric(failure: Exception) =
    metricFromFailure(failure, "DataSynchronization", "", Entity.Dataset)
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy