/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.streaming

import java.util.Locale
import java.util.concurrent.TimeoutException

import scala.jdk.CollectionConverters._

import org.apache.hadoop.fs.Path

import org.apache.spark.annotation.Evolving
import org.apache.spark.api.java.function.VoidFunction2
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.UnresolvedIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.logical.{ColumnDefinition, CreateTable, OptionList, UnresolvedTableSpec}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.connector.catalog.{Identifier, SupportsWrite, Table, TableCatalog, TableProvider, V1Table, V2TableWithV1Fallback}
import org.apache.spark.sql.connector.catalog.TableCapability._
import org.apache.spark.sql.connector.expressions.{ClusterByTransform, FieldReference}
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.{DataSource, DataSourceUtils}
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Utils, FileDataSourceV2}
import org.apache.spark.sql.execution.datasources.v2.python.PythonDataSourceV2
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.ArrayImplicits._
import org.apache.spark.util.Utils

/**
 * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems,
 * key-value stores, etc). Use `Dataset.writeStream` to access this.
 *
 * @since 2.0.0
 */
@Evolving
final class DataStreamWriter[T] private[sql](ds: Dataset[T]) {
  import DataStreamWriter._

  private val df = ds.toDF()
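
  // Editor's note: a hedged usage sketch, not part of the original source. A typical end-to-end
  // use of this builder obtained via `Dataset.writeStream`; `spark` is assumed to be an active
  // SparkSession, and the source, format, trigger, and paths below are illustrative.
  // {{{
  //   val input = spark.readStream.format("rate").load()
  //   val query = input.writeStream
  //     .format("parquet")
  //     .option("checkpointLocation", "/tmp/checkpoints/rate_to_parquet")
  //     .outputMode(OutputMode.Append())
  //     .trigger(Trigger.ProcessingTime("10 seconds"))
  //     .start("/tmp/output/rate")
  //   query.awaitTermination()
  // }}}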

  /**
   * Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink.
   * <ul>
   * <li> `OutputMode.Append()`: only the new rows in the streaming DataFrame/Dataset will be
   * written to the sink.</li>
   * <li> `OutputMode.Complete()`: all the rows in the streaming DataFrame/Dataset will be written
   * to the sink every time there are some updates.</li>
   * <li> `OutputMode.Update()`: only the rows that were updated in the streaming
   * DataFrame/Dataset will be written to the sink every time there are some updates.
   * If the query doesn't contain aggregations, it will be equivalent to
   * `OutputMode.Append()` mode.</li>
   * </ul>
   *
   * @since 2.0.0
   */
  def outputMode(outputMode: OutputMode): DataStreamWriter[T] = {
    this.outputMode = outputMode
    this
  }

  /**
   * Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink.
   * <ul>
   * <li> `append`: only the new rows in the streaming DataFrame/Dataset will be written to
   * the sink.</li>
   * <li> `complete`: all the rows in the streaming DataFrame/Dataset will be written to the sink
   * every time there are some updates.</li>
   * <li> `update`: only the rows that were updated in the streaming DataFrame/Dataset will
   * be written to the sink every time there are some updates. If the query doesn't
   * contain aggregations, it will be equivalent to `append` mode.</li>
   * </ul>
   *
   * @since 2.0.0
   */
  def outputMode(outputMode: String): DataStreamWriter[T] = {
    this.outputMode = InternalOutputModes(outputMode)
    this
  }
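
  // Editor's note: a hedged usage sketch, not part of the original source. Both overloads accept
  // the same three modes and the string form is case-insensitive; `counts` is assumed to be a
  // streaming aggregation result.
  // {{{
  //   counts.writeStream.outputMode(OutputMode.Complete())
  //   counts.writeStream.outputMode("complete")
  // }}}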

  /**
   * Set the trigger for the stream query. The default value is `ProcessingTime(0)` and it will run
   * the query as fast as possible.
   *
   * Scala Example:
   * {{{
   *   df.writeStream.trigger(ProcessingTime("10 seconds"))
   *
   *   import scala.concurrent.duration._
   *   df.writeStream.trigger(ProcessingTime(10.seconds))
   * }}}
   *
   * Java Example:
   * {{{
   *   df.writeStream().trigger(ProcessingTime.create("10 seconds"))
   *
   *   import java.util.concurrent.TimeUnit
   *   df.writeStream().trigger(ProcessingTime.create(10, TimeUnit.SECONDS))
   * }}}
   *
   * @since 2.0.0
   */
  def trigger(trigger: Trigger): DataStreamWriter[T] = {
    this.trigger = trigger
    this
  }

  /**
   * Specifies the name of the [[StreamingQuery]] that can be started with `start()`.
   * This name must be unique among all the currently active queries in the associated SQLContext.
   *
   * @since 2.0.0
   */
  def queryName(queryName: String): DataStreamWriter[T] = {
    this.extraOptions += ("queryName" -> queryName)
    this
  }

  /**
   * Specifies the underlying output data source.
   *
   * @since 2.0.0
   */
  def format(source: String): DataStreamWriter[T] = {
    this.source = source
    this
  }

  /**
   * Partitions the output by the given columns on the file system. If specified, the output is
   * laid out on the file system similar to Hive's partitioning scheme. As an example, when we
   * partition a dataset by year and then month, the directory layout would look like:
   *
   * <ul>
   * <li> year=2016/month=01/</li>
   * <li> year=2016/month=02/</li>
   * </ul>
   *
   * Partitioning is one of the most widely used techniques to optimize physical data layout.
   * It provides a coarse-grained index for skipping unnecessary data reads when queries have
   * predicates on the partitioned columns. In order for partitioning to work well, the number
   * of distinct values in each column should typically be less than tens of thousands.
   *
   * @since 2.0.0
   */
  @scala.annotation.varargs
  def partitionBy(colNames: String*): DataStreamWriter[T] = {
    this.partitioningColumns = Option(colNames)
    validatePartitioningAndClustering()
    this
  }
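
  // Editor's note: a hedged usage sketch, not part of the original source. Partitioning a
  // file-based sink by low-cardinality columns; the `events` Dataset and paths are illustrative.
  // {{{
  //   events.writeStream
  //     .format("parquet")
  //     .partitionBy("year", "month")
  //     .option("checkpointLocation", "/tmp/checkpoints/events")
  //     .start("/tmp/output/events")
  // }}}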

  /**
   * Clusters the output by the given columns. If specified, the output is laid out such that
   * records with similar values on the clustering column are grouped together in the same file.
   *
   * Clustering improves query efficiency by allowing queries with predicates on the clustering
   * columns to skip unnecessary data. Unlike partitioning, clustering can be used on very high
   * cardinality columns.
   *
   * @since 4.0.0
   */
  @scala.annotation.varargs
  def clusterBy(colNames: String*): DataStreamWriter[T] = {
    this.clusteringColumns = Option(colNames)
    validatePartitioningAndClustering()
    this
  }
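
  // Editor's note: a hedged usage sketch, not part of the original source. Clustering on a
  // high-cardinality column instead of partitioning; `clusterBy` and `partitionBy` cannot be
  // combined (see validatePartitioningAndClustering below). The table name, column, and
  // checkpoint location are illustrative.
  // {{{
  //   events.writeStream
  //     .clusterBy("userId")
  //     .option("checkpointLocation", "/tmp/checkpoints/events_clustered")
  //     .toTable("events_clustered")
  // }}}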

  /**
   * Adds an output option for the underlying data source.
   *
   * @since 2.0.0
   */
  def option(key: String, value: String): DataStreamWriter[T] = {
    this.extraOptions += (key -> value)
    this
  }

  /**
   * Adds an output option for the underlying data source.
   *
   * @since 2.0.0
   */
  def option(key: String, value: Boolean): DataStreamWriter[T] = option(key, value.toString)

  /**
   * Adds an output option for the underlying data source.
   *
   * @since 2.0.0
   */
  def option(key: String, value: Long): DataStreamWriter[T] = option(key, value.toString)

  /**
   * Adds an output option for the underlying data source.
   *
   * @since 2.0.0
   */
  def option(key: String, value: Double): DataStreamWriter[T] = option(key, value.toString)

  /**
   * (Scala-specific) Adds output options for the underlying data source.
   *
   * @since 2.0.0
   */
  def options(options: scala.collection.Map[String, String]): DataStreamWriter[T] = {
    this.extraOptions ++= options
    this
  }

  /**
   * Adds output options for the underlying data source.
   *
   * @since 2.0.0
   */
  def options(options: java.util.Map[String, String]): DataStreamWriter[T] = {
    this.options(options.asScala)
    this
  }

  /**
   * Starts the execution of the streaming query, which will continually output results to the
   * given path as new data arrives. The returned [[StreamingQuery]] object can be used to
   * interact with the stream.
   *
   * @since 2.0.0
   */
  def start(path: String): StreamingQuery = {
    if (!df.sparkSession.sessionState.conf.legacyPathOptionBehavior &&
        extraOptions.contains("path")) {
      throw QueryCompilationErrors.setPathOptionAndCallWithPathParameterError("start")
    }
    startInternal(Some(path))
  }
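
  // Editor's note: a hedged usage sketch, not part of the original source. Options can be set one
  // at a time or from a Map; `start(path)` supplies the output path directly instead of a "path"
  // option. The format, option values, and paths are illustrative.
  // {{{
  //   df.writeStream
  //     .format("json")
  //     .option("checkpointLocation", "/tmp/checkpoints/json_out")
  //     .options(Map("compression" -> "gzip"))
  //     .start("/tmp/output/json")
  // }}}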

  /**
   * Starts the execution of the streaming query, which will continually output results to the
   * given path as new data arrives. The returned [[StreamingQuery]] object can be used to
   * interact with the stream. Throws a `TimeoutException` if the following conditions are met:
   *  - Another run of the same streaming query, that is a streaming query
   *    sharing the same checkpoint location, is already active on the same
   *    Spark Driver
   *  - The SQL configuration `spark.sql.streaming.stopActiveRunOnRestart`
   *    is enabled
   *  - The active run cannot be stopped within the timeout controlled by
   *    the SQL configuration `spark.sql.streaming.stopTimeout`
   *
   * @since 2.0.0
   */
  @throws[TimeoutException]
  def start(): StreamingQuery = startInternal(None)
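
  // Editor's note: a hedged usage sketch, not part of the original source. With a sink that is
  // configured entirely through options, the no-argument `start()` is used; it can throw
  // `TimeoutException` under the conditions documented above. The Kafka bootstrap servers, topic,
  // and checkpoint location are illustrative.
  // {{{
  //   val query = df.writeStream
  //     .format("kafka")
  //     .option("kafka.bootstrap.servers", "host1:9092")
  //     .option("topic", "output-topic")
  //     .option("checkpointLocation", "/tmp/checkpoints/kafka_out")
  //     .start()
  //   query.awaitTermination()
  // }}}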

  /**
   * Starts the execution of the streaming query, which will continually output results to the
   * given table as new data arrives. The returned [[StreamingQuery]] object can be used to
   * interact with the stream.
   *
   * For a v1 table, partitioning columns provided by `partitionBy` will be respected whether or
   * not the table exists. A new table will be created if the table does not exist.
   *
   * For a v2 table, `partitionBy` will be ignored if the table already exists. `partitionBy` will
   * be respected only if the v2 table does not exist. Besides, the v2 table created by this API
   * lacks some functionalities (e.g., customized properties, options, and serde info). If you
   * need them, please create the v2 table manually before the execution to avoid creating a
   * table with incomplete information.
   *
   * @since 3.1.0
   */
  @Evolving
  @throws[TimeoutException]
  def toTable(tableName: String): StreamingQuery = {
    this.tableName = tableName

    import df.sparkSession.sessionState.analyzer.CatalogAndIdentifier
    import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._

    val parser = df.sparkSession.sessionState.sqlParser
    val originalMultipartIdentifier = parser.parseMultipartIdentifier(tableName)
    val CatalogAndIdentifier(catalog, identifier) = originalMultipartIdentifier

    // Currently we don't create a logical streaming writer node in logical plan, so cannot rely
    // on analyzer to resolve it. Directly lookup only for temp view to provide clearer message.
    // TODO (SPARK-27484): we should add the writing node before the plan is analyzed.
    if (df.sparkSession.sessionState.catalog.isTempView(originalMultipartIdentifier)) {
      throw QueryCompilationErrors.tempViewNotSupportStreamingWriteError(tableName)
    }

    if (!catalog.asTableCatalog.tableExists(identifier)) {
      import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._

      val properties = normalizedClusteringCols.map { cols =>
        Map(
          DataSourceUtils.CLUSTERING_COLUMNS_KEY ->
            DataSourceUtils.encodePartitioningColumns(cols))
      }.getOrElse(Map.empty)
      val partitioningOrClusteringTransform = normalizedClusteringCols.map { colNames =>
        Array(ClusterByTransform(colNames.map(col => FieldReference(col)))).toImmutableArraySeq
      }.getOrElse(partitioningColumns.getOrElse(Nil).asTransforms.toImmutableArraySeq)

      /**
       * Note, currently the new table creation by this API doesn't fully cover the V2 table.
       * TODO (SPARK-33638): Full support of v2 table creation
       */
      val tableSpec = UnresolvedTableSpec(
        properties,
        Some(source),
        OptionList(Seq.empty),
        extraOptions.get("path"),
        None,
        None,
        false)
      val cmd = CreateTable(
        UnresolvedIdentifier(originalMultipartIdentifier),
        df.schema.asNullable.map(ColumnDefinition.fromV1Column(_, parser)),
        partitioningOrClusteringTransform,
        tableSpec,
        ignoreIfExists = false)
      Dataset.ofRows(df.sparkSession, cmd)
    }

    val tableInstance = catalog.asTableCatalog.loadTable(identifier)

    def writeToV1Table(table: CatalogTable): StreamingQuery = {
      if (table.tableType == CatalogTableType.VIEW) {
        throw QueryCompilationErrors.streamingIntoViewNotSupportedError(tableName)
      }
      require(table.provider.isDefined)
      if (source != table.provider.get) {
        throw QueryCompilationErrors.inputSourceDiffersFromDataSourceProviderError(
          source, tableName, table)
      }
      format(table.provider.get).startInternal(
        Some(new Path(table.location).toString), catalogTable = Some(table))
    }

    import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
    tableInstance match {
      case t: SupportsWrite if t.supports(STREAMING_WRITE) =>
        startQuery(t, extraOptions, catalogAndIdent = Some(catalog.asTableCatalog, identifier))
      case t: V2TableWithV1Fallback =>
        writeToV1Table(t.v1Table)
      case t: V1Table =>
        writeToV1Table(t.v1Table)
      case t => throw QueryCompilationErrors.tableNotSupportStreamingWriteError(tableName, t)
    }
  }
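
  // Editor's note: a hedged usage sketch, not part of the original source. Writing to a (possibly
  // not yet existing) catalog table; the table name and checkpoint location are illustrative.
  // {{{
  //   df.writeStream
  //     .option("checkpointLocation", "/tmp/checkpoints/events_table")
  //     .toTable("main.default.events")
  // }}}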

  private def startInternal(
      path: Option[String],
      catalogTable: Option[CatalogTable] = None): StreamingQuery = {
    if (source.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) {
      throw QueryCompilationErrors.cannotOperateOnHiveDataSourceFilesError("write")
    }

    if (source == SOURCE_NAME_MEMORY) {
      assertNotPartitioned(SOURCE_NAME_MEMORY)
      if (extraOptions.get("queryName").isEmpty) {
        throw QueryCompilationErrors.queryNameNotSpecifiedForMemorySinkError()
      }
      val sink = new MemorySink()
      val resultDf = Dataset.ofRows(df.sparkSession,
        MemoryPlan(sink, DataTypeUtils.toAttributes(df.schema)))
      val recoverFromCheckpoint = outputMode == OutputMode.Complete()
      val query = startQuery(sink, extraOptions, recoverFromCheckpoint = recoverFromCheckpoint,
        catalogTable = catalogTable)
      resultDf.createOrReplaceTempView(query.name)
      query
    } else if (source == SOURCE_NAME_FOREACH) {
      assertNotPartitioned(SOURCE_NAME_FOREACH)
      val sink = ForeachWriterTable[Any](foreachWriter, foreachWriterEncoder)
      startQuery(sink, extraOptions, catalogTable = catalogTable)
    } else if (source == SOURCE_NAME_FOREACH_BATCH) {
      assertNotPartitioned(SOURCE_NAME_FOREACH_BATCH)
      if (trigger.isInstanceOf[ContinuousTrigger]) {
        throw QueryCompilationErrors.sourceNotSupportedWithContinuousTriggerError(source)
      }
      val sink = new ForeachBatchSink[T](foreachBatchWriter, ds.exprEnc)
      startQuery(sink, extraOptions, catalogTable = catalogTable)
    } else {
      val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf)
      val disabledSources =
        Utils.stringToSeq(df.sparkSession.sessionState.conf.disabledV2StreamingWriters)
      val useV1Source = disabledSources.contains(cls.getCanonicalName) ||
        // file source v2 does not support streaming yet.
        classOf[FileDataSourceV2].isAssignableFrom(cls)

      val optionsWithPath = if (path.isEmpty) {
        extraOptions
      } else {
        extraOptions + ("path" -> path.get)
      }

      val sink = if (classOf[TableProvider].isAssignableFrom(cls) && !useV1Source) {
        val provider = cls.getConstructor().newInstance().asInstanceOf[TableProvider]
        val sessionOptions = DataSourceV2Utils.extractSessionConfigs(
          source = provider, conf = df.sparkSession.sessionState.conf)
        val finalOptions = sessionOptions.filter { case (k, _) => !optionsWithPath.contains(k) } ++
          optionsWithPath.originalMap
        val dsOptions = new CaseInsensitiveStringMap(finalOptions.asJava)
        // If the source accepts external table metadata, here we pass the schema of input query
        // to `getTable`. This is for avoiding schema inference, which can be very expensive.
        // If the query schema is not compatible with the existing data, the behavior is undefined.
        val outputSchema = if (provider.supportsExternalMetadata()) {
          Some(df.schema)
        } else {
          None
        }
        provider match {
          case p: PythonDataSourceV2 => p.setShortName(source)
          case _ =>
        }
        val table = DataSourceV2Utils.getTableFromProvider(
          provider, dsOptions, userSpecifiedSchema = outputSchema)
        import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
        table match {
          case table: SupportsWrite if table.supports(STREAMING_WRITE) =>
            table
          case _ => createV1Sink(optionsWithPath)
        }
      } else {
        createV1Sink(optionsWithPath)
      }

      startQuery(sink, optionsWithPath, catalogTable = catalogTable)
    }
  }

  private def startQuery(
      sink: Table,
      newOptions: CaseInsensitiveMap[String],
      recoverFromCheckpoint: Boolean = true,
      catalogAndIdent: Option[(TableCatalog, Identifier)] = None,
      catalogTable: Option[CatalogTable] = None): StreamingQuery = {
    val useTempCheckpointLocation = SOURCES_ALLOW_ONE_TIME_QUERY.contains(source)

    df.sparkSession.sessionState.streamingQueryManager.startQuery(
      newOptions.get("queryName"),
      newOptions.get("checkpointLocation"),
      df,
      newOptions.originalMap,
      sink,
      outputMode,
      useTempCheckpointLocation = useTempCheckpointLocation,
      recoverFromCheckpointLocation = recoverFromCheckpoint,
      trigger = trigger,
      catalogAndIdent = catalogAndIdent,
      catalogTable = catalogTable)
  }

  private def createV1Sink(optionsWithPath: CaseInsensitiveMap[String]): Sink = {
    // Do not allow the user to specify clustering columns in the options. Ignoring this option is
    // consistent with the behavior of DataFrameWriter on non Path-based tables and with the
    // behavior of DataStreamWriter on partitioning columns specified in options.
    val optionsWithoutClusteringKey =
      optionsWithPath.originalMap - DataSourceUtils.CLUSTERING_COLUMNS_KEY
    val optionsWithClusteringColumns = normalizedClusteringCols match {
      case Some(cols) => optionsWithoutClusteringKey + (
        DataSourceUtils.CLUSTERING_COLUMNS_KEY ->
          DataSourceUtils.encodePartitioningColumns(cols))
      case None => optionsWithoutClusteringKey
    }
    val ds = DataSource(
      df.sparkSession,
      className = source,
      options = optionsWithClusteringColumns,
      partitionColumns = normalizedParCols.getOrElse(Nil))
    ds.createSink(outputMode)
  }

  /**
   * Sets the output of the streaming query to be processed using the provided writer object.
   * See [[org.apache.spark.sql.ForeachWriter]] for more details on the lifecycle and semantics.
   *
   * @since 2.0.0
   */
  def foreach(writer: ForeachWriter[T]): DataStreamWriter[T] = {
    foreachImplementation(writer.asInstanceOf[ForeachWriter[Any]])
  }
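
  // Editor's note: a hedged usage sketch, not part of the original source. A minimal
  // `ForeachWriter` that prints each row; the connection handling a real writer would do in
  // `open`/`close` is omitted.
  // {{{
  //   df.writeStream.foreach(new ForeachWriter[Row] {
  //     def open(partitionId: Long, epochId: Long): Boolean = true
  //     def process(value: Row): Unit = println(value)
  //     def close(errorOrNull: Throwable): Unit = ()
  //   }).start()
  // }}}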

  private[sql] def foreachImplementation(writer: ForeachWriter[Any],
      encoder: Option[ExpressionEncoder[Any]] = None): DataStreamWriter[T] = {
    this.source = SOURCE_NAME_FOREACH
    this.foreachWriter = if (writer != null) {
      ds.sparkSession.sparkContext.clean(writer)
    } else {
      throw new IllegalArgumentException("foreach writer cannot be null")
    }
    encoder.foreach(e => this.foreachWriterEncoder = e)
    this
  }

  /**
   * :: Experimental ::
   *
   * (Scala-specific) Sets the output of the streaming query to be processed using the provided
   * function. This is supported only in the micro-batch execution modes (that is, when the
   * trigger is not continuous). In every micro-batch, the provided function will be called with
   * (i) the output rows as a Dataset and (ii) the batch identifier.
   * The batchId can be used to deduplicate and transactionally write the output
   * (that is, the provided Dataset) to external systems. The output Dataset is guaranteed
   * to be exactly the same for the same batchId (assuming all operations are deterministic
   * in the query).
   *
   * @since 2.4.0
   */
  @Evolving
  def foreachBatch(function: (Dataset[T], Long) => Unit): DataStreamWriter[T] = {
    this.source = SOURCE_NAME_FOREACH_BATCH
    if (function == null) throw new IllegalArgumentException("foreachBatch function cannot be null")
    this.foreachBatchWriter = function
    this
  }

  /**
   * :: Experimental ::
   *
   * (Java-specific) Sets the output of the streaming query to be processed using the provided
   * function. This is supported only in the micro-batch execution modes (that is, when the
   * trigger is not continuous). In every micro-batch, the provided function will be called with
   * (i) the output rows as a Dataset and (ii) the batch identifier.
   * The batchId can be used to deduplicate and transactionally write the output
   * (that is, the provided Dataset) to external systems. The output Dataset is guaranteed
   * to be exactly the same for the same batchId (assuming all operations are deterministic
   * in the query).
   *
   * @since 2.4.0
   */
  @Evolving
  def foreachBatch(function: VoidFunction2[Dataset[T], java.lang.Long]): DataStreamWriter[T] = {
    foreachBatch((batchDs: Dataset[T], batchId: Long) => function.call(batchDs, batchId))
  }
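
  // Editor's note: a hedged usage sketch, not part of the original source. Using the batchId to
  // write each micro-batch to its own location so reruns of a batch overwrite rather than
  // duplicate; the target path is illustrative.
  // {{{
  //   df.writeStream.foreachBatch { (batch: Dataset[Row], batchId: Long) =>
  //     batch.write.mode("overwrite").parquet(s"/tmp/output/batches/batch=$batchId")
  //   }.start()
  // }}}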

  private def normalizedParCols: Option[Seq[String]] = partitioningColumns.map { cols =>
    cols.map(normalize(_, "Partition"))
  }

  private def normalizedClusteringCols: Option[Seq[String]] = clusteringColumns.map { cols =>
    cols.map(normalize(_, "Clustering"))
  }

  /**
   * The given column name may not be equal to any of the existing column names if we were in
   * case-insensitive context. Normalize the given column name to the real one so that we don't
   * need to care about case sensitivity afterwards.
   */
  private def normalize(columnName: String, columnType: String): String = {
    val validColumnNames = df.logicalPlan.output.map(_.name)
    validColumnNames.find(df.sparkSession.sessionState.analyzer.resolver(_, columnName))
      .getOrElse(throw QueryCompilationErrors.columnNotFoundInExistingColumnsError(
        columnType, columnName, validColumnNames))
  }

  private def assertNotPartitioned(operation: String): Unit = {
    if (partitioningColumns.isDefined) {
      throw QueryCompilationErrors.operationNotSupportPartitioningError(operation)
    }
  }

  // Validate that partitionBy isn't used with clusterBy.
  private def validatePartitioningAndClustering(): Unit = {
    if (clusteringColumns.nonEmpty && partitioningColumns.nonEmpty) {
      throw QueryCompilationErrors.clusterByWithPartitionedBy()
    }
  }

  ///////////////////////////////////////////////////////////////////////////////////////
  // Builder pattern config options
  ///////////////////////////////////////////////////////////////////////////////////////

  private var source: String = df.sparkSession.sessionState.conf.defaultDataSourceName

  private var tableName: String = null

  private var outputMode: OutputMode = OutputMode.Append

  private var trigger: Trigger = Trigger.ProcessingTime(0L)

  private var extraOptions = CaseInsensitiveMap[String](Map.empty)

  private var foreachWriter: ForeachWriter[Any] = null

  private var foreachWriterEncoder: ExpressionEncoder[Any] =
    ds.exprEnc.asInstanceOf[ExpressionEncoder[Any]]

  private var foreachBatchWriter: (Dataset[T], Long) => Unit = null

  private var partitioningColumns: Option[Seq[String]] = None

  private var clusteringColumns: Option[Seq[String]] = None
}

object DataStreamWriter {
  val SOURCE_NAME_MEMORY = "memory"
  val SOURCE_NAME_FOREACH = "foreach"
  val SOURCE_NAME_FOREACH_BATCH = "foreachBatch"
  val SOURCE_NAME_CONSOLE = "console"
  val SOURCE_NAME_TABLE = "table"
  val SOURCE_NAME_NOOP = "noop"

  // these writer sources are also used for one-time query, hence allow temp checkpoint location
  val SOURCES_ALLOW_ONE_TIME_QUERY = Seq(SOURCE_NAME_MEMORY, SOURCE_NAME_FOREACH,
    SOURCE_NAME_FOREACH_BATCH, SOURCE_NAME_CONSOLE, SOURCE_NAME_NOOP)
}



