org.apache.spark.sql.Observation.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql

import java.util.UUID
import java.util.concurrent.atomic.AtomicBoolean

import scala.concurrent.{Future, Promise}
import scala.concurrent.duration.{Duration, DurationInt}
import scala.jdk.CollectionConverters.MapHasAsJava
import scala.util.Try

import org.apache.spark.util.SparkThreadUtils

/**
 * Helper class to simplify usage of `Dataset.observe(String, Column, Column*)`:
 *
 * {{{
 *   // Observe row count (rows) and highest id (maxid) in the Dataset while writing it
 *   val observation = Observation("my metrics")
 *   val observed_ds = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid"))
 *   observed_ds.write.parquet("ds.parquet")
 *   val metrics = observation.get
 * }}}
 *
 * This collects the metrics while the first action is executed on the observed dataset.
 * Subsequent actions do not modify the metrics returned by [[get]]. Retrieval of the metrics
 * via [[get]] blocks until the first action has finished and metrics become available.
 *
 * This class does not support streaming datasets.
 *
 * @param name
 *   name of the metric
 * @since 3.3.0
 */
class Observation(val name: String) {
  require(name.nonEmpty, "Name must not be empty")

  /**
   * Create an Observation with a random name.
   */
  def this() = this(UUID.randomUUID().toString)

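  // Flips to true when this Observation is attached to a Dataset; guards against reuse.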
  private val isRegistered = new AtomicBoolean()

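  // Completed once with the metrics of the first action on the observed Dataset.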
  private val promise = Promise[Map[String, Any]]()

  /**
   * Future holding the (yet to be completed) observation.
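   *
   * A minimal sketch of non-blocking access (assuming an implicit `ExecutionContext` is in
   * scope and a metric named `rows` was defined as in the class example above):
   * {{{
   *   observation.future.foreach(metrics => println(metrics("rows")))
   * }}}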
   */
  val future: Future[Map[String, Any]] = promise.future

  /**
   * (Scala-specific) Get the observed metrics. This waits for the observed dataset to finish its
   * first action. Only the result of the first action is available. Subsequent actions do not
   * modify the result.
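   *
   * A minimal sketch (assuming `observation` was passed to `ds.observe(...)` as in the class
   * example above and an action on the observed Dataset has been or will be triggered):
   * {{{
   *   val metrics = observation.get    // blocks until the first action finishes
   *   val rowCount = metrics("rows")   // metric keys are the column aliases used in observe
   * }}}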
   *
   * @return
   *   the observed metrics as a `Map[String, Any]`
   * @throws InterruptedException
   *   interrupted while waiting
   */
  @throws[InterruptedException]
  def get: Map[String, Any] = SparkThreadUtils.awaitResult(future, Duration.Inf)

  /**
   * (Java-specific) Get the observed metrics. This waits for the observed dataset to finish its
   * first action. Only the result of the first action is available. Subsequent actions do not
   * modify the result.
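   *
   * A minimal sketch (same assumptions as for [[get]]; the keys are the column aliases used in
   * `observe`):
   * {{{
   *   val javaMetrics: java.util.Map[String, Any] = observation.getAsJava
   *   println(javaMetrics.get("rows"))
   * }}}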
   *
   * @return
   *   the observed metrics as a `java.util.Map[String, Object]`
   * @throws InterruptedException
   *   interrupted while waiting
   */
  @throws[InterruptedException]
  def getAsJava: java.util.Map[String, Any] = get.asJava

  /**
   * Get the observed metrics. This waits up to 100 milliseconds for the metrics and returns
   * them if available, otherwise an empty map.
   *
   * @return
   *   the observed metrics as a `Map[String, Any]`
   */
  @throws[InterruptedException]
  private[sql] def getOrEmpty: Map[String, Any] = {
    Try(SparkThreadUtils.awaitResult(future, 100.millis)).getOrElse(Map.empty)
  }

  /**
   * Mark this Observation as registered. Throws an `IllegalArgumentException` if it has already
   * been registered, since an Observation can be attached to a Dataset only once.
   */
  private[sql] def markRegistered(): Unit = {
    if (!isRegistered.compareAndSet(false, true)) {
      throw new IllegalArgumentException("An Observation can be used with a Dataset only once")
    }
  }

  /**
   * Set the observed metrics and notify all waiting threads to resume.
   *
   * @return
   *   `true` if the metrics were set by this call, `false` if they had already been set.
   */
  private[sql] def setMetricsAndNotify(metrics: Row): Boolean = {
    val metricsMap = metrics.getValuesMap(metrics.schema.map(_.name))
    promise.trySuccess(metricsMap)
  }
}

/**
 * (Scala-specific) Create instances of Observation via Scala `apply`.
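 *
 * A brief sketch, mirroring the constructors above:
 * {{{
 *   val anonymous = Observation()          // random UUID name
 *   val named = Observation("my metrics")  // explicit name
 * }}}
 *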
 * @since 3.3.0
 */
object Observation {

  /**
   * Observation constructor for creating an anonymous observation.
   */
  def apply(): Observation = new Observation()

  /**
   * Observation constructor for creating a named observation.
   */
  def apply(name: String): Observation = new Observation(name)
}



