All downloads are free. Search and download functionality uses the official Maven repository.

com.azure.cosmos.spark.ItemsScan.scala Maven / Gradle / Ivy

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
package com.azure.cosmos.spark

import com.azure.cosmos.models.{CosmosParameterizedQuery, SqlParameter, SqlQuerySpec}
import com.azure.cosmos.spark.CosmosPredicates.requireNotNull
import com.azure.cosmos.spark.diagnostics.{DiagnosticsContext, LoggerHelper}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.read.streaming.ReadLimit
import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan}
import org.apache.spark.sql.types.StructType

import java.util.UUID

/**
 * Batch [[Scan]] over the items of a Cosmos container, driven by a parameterized query.
 *
 * The instance acts as its own [[Batch]] (see `toBatch`): it plans the input partitions
 * and creates the partition reader factory.
 *
 * @param session                  Spark session; its default parallelism determines the default
 *                                 minimum partition count and it is used to derive the Spark
 *                                 environment info of the client configuration
 * @param schema                   schema returned by `readSchema` and handed to the reader factory
 * @param config                   raw configuration map from which the client, container and
 *                                 partitioning configurations are parsed
 * @param readConfig               read configuration; only `forceEventualConsistency` is read here
 * @param cosmosQuery              query to execute; must not be null
 * @param cosmosClientStateHandles broadcast snapshots of the client metadata caches
 * @param diagnosticsConfig        diagnostics configuration used to create the logger
 * @param sparkEnvironmentInfo     Spark environment description forwarded to throughput control
 *                                 and to the reader factory
 */
private case class ItemsScan(session: SparkSession,
                             schema: StructType,
                             config: Map[String, String],
                             readConfig: CosmosReadConfig,
                             cosmosQuery: CosmosParameterizedQuery,
                             cosmosClientStateHandles: Broadcast[CosmosClientMetadataCachesSnapshots],
                             diagnosticsConfig: DiagnosticsConfig,
                             sparkEnvironmentInfo: String)
  extends Scan
    with Batch {

  requireNotNull(cosmosQuery, "cosmosQuery")

  @transient private lazy val log = LoggerHelper.getLogger(diagnosticsConfig, this.getClass)
  log.logTrace(s"Instantiated ${this.getClass.getSimpleName}")

  // NOTE(review): the environment info is derived from the session here instead of reusing the
  // `sparkEnvironmentInfo` constructor argument - confirm both are meant to be equivalent.
  private val clientConfiguration = CosmosClientConfiguration.apply(
    config,
    readConfig.forceEventualConsistency,
    CosmosClientConfiguration.getSparkEnvironmentInfo(Some(session))
  )
  private val containerConfig = CosmosContainerConfig.parseCosmosContainerConfig(config)
  private val partitioningConfig = CosmosPartitioningConfig.parseCosmosPartitioningConfig(config)
  // Default lower bound for the number of planned partitions: 1 + 2 * defaultParallelism.
  private val defaultMinPartitionCount = 1 + (2 * session.sparkContext.defaultParallelism)

  /**
   * Human readable description of this scan: the target database/container followed by the
   * query text and its parameters.
   */
  override def description(): String = {
    s"""Cosmos ItemsScan: ${containerConfig.database}.${containerConfig.container}
       | - Cosmos Query: ${toPrettyString(cosmosQuery.toSqlQuerySpec)}""".stripMargin
  }

  /**
   * Renders the query text followed by one " > param: name = value" line per query parameter.
   *
   * @param query the query spec to render
   * @return the multi-line textual representation
   */
  private[this] def toPrettyString(query: SqlQuerySpec): String = {
    val sb = new StringBuilder()
    sb.append(query.getQueryText)
    query.getParameters.forEach(
      (p: SqlParameter) => sb
        .append(CosmosConstants.SystemProperties.LineSeparator)
        .append(" > param: ")
        .append(p.getName)
        .append(" = ")
        .append(p.getValue(classOf[Any])))

    sb.toString
  }

  /**
   * Returns the actual schema of this data source scan, which may be different from the physical
   * schema of the underlying storage, as column pruning or other optimizations may happen.
   */
  override def readSchema(): StructType = {
    schema
  }

  /**
   * Plans the input partitions of this batch scan.
   *
   * Retrieves the filtered partition metadata, borrows the Cosmos client (and the optional
   * throughput control client) for the duration of the planning, and delegates the actual
   * partition creation to [[CosmosPartitionPlanner]].
   */
  override def planInputPartitions(): Array[InputPartition] = {
    val partitionMetadata = CosmosPartitionPlanner.getFilteredPartitionMetadata(
      config,
      clientConfiguration,
      Some(cosmosClientStateHandles),
      containerConfig,
      partitioningConfig,
      false
    )

    // `${description()}` is required here: `$description()` would interpolate only the identifier
    // and leave a literal "()" behind in the resulting string.
    val calledFrom = s"ItemsScan(${description()}).planInputPartitions"
    Loan(
      List[Option[CosmosClientCacheItem]](
        Some(CosmosClientCache.apply(
          clientConfiguration,
          Some(cosmosClientStateHandles.value.cosmosClientMetadataCaches),
          calledFrom
        )),
        ThroughputControlHelper.getThroughputControlClientCacheItem(
          config, calledFrom, Some(cosmosClientStateHandles), sparkEnvironmentInfo)
      ))
      .to(clientCacheItems => {
        // Index 0 is the regular client (always defined, see the list built above);
        // index 1 is the optional throughput control client.
        val container =
          ThroughputControlHelper.getContainer(
            config,
            containerConfig,
            clientCacheItems(0).get,
            clientCacheItems(1))
        SparkUtils.safeOpenConnectionInitCaches(container, log)

        CosmosPartitionPlanner
          .createInputPartitions(
            partitioningConfig,
            container,
            partitionMetadata,
            defaultMinPartitionCount,
            CosmosPartitionPlanner.DefaultPartitionSizeInMB,
            ReadLimit.allAvailable(),
            false
          )
          .map(_.asInstanceOf[InputPartition])
      })
  }

  /**
   * Creates the reader factory for this scan. A fresh correlation activity id is generated
   * per call and logged together with the query text.
   */
  override def createReaderFactory(): PartitionReaderFactory = {
    val correlationActivityId = UUID.randomUUID()
    log.logInfo(s"Creating ItemsScan with CorrelationActivityId '${correlationActivityId.toString}' for query '${cosmosQuery.queryText}'")
    // NOTE(review): the diagnostics config is re-parsed from `config` instead of reusing the
    // `diagnosticsConfig` constructor argument - confirm whether that is intentional.
    ItemsScanPartitionReaderFactory(config,
      schema,
      cosmosQuery,
      DiagnosticsContext(correlationActivityId, cosmosQuery.queryText),
      cosmosClientStateHandles,
      DiagnosticsConfig.parseDiagnosticsConfig(config),
      sparkEnvironmentInfo)
  }

  /** This scan is its own batch representation. */
  override def toBatch: Batch = {
    this
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy