org.apache.spark.sql.delta.sources.DeltaSQLConf.scala
/*
 * Copyright (2020) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.sources

// scalastyle:off import.ordering.noEmptyLine
import java.util.concurrent.TimeUnit

import org.apache.spark.internal.config.ConfigBuilder
import org.apache.spark.sql.internal.SQLConf

/**
 * [[SQLConf]] entries for Delta features.
 */
object DeltaSQLConf {
  def buildConf(key: String): ConfigBuilder = SQLConf.buildConf(s"spark.databricks.delta.$key")

  def buildStaticConf(key: String): ConfigBuilder =
    SQLConf.buildStaticConf(s"spark.databricks.delta.$key")
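
  // Editorial sketch (not part of the original file): both builders simply prefix the
  // supplied key with "spark.databricks.delta.". For example, an entry defined as
  //
  //   buildConf("stats.collect").internal().booleanConf.createWithDefault(true)
  //
  // is surfaced under the full key "spark.databricks.delta.stats.collect" and can be
  // toggled at runtime (assuming a live SparkSession named `spark`) with:
  //
  //   spark.conf.set("spark.databricks.delta.stats.collect", "false")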

  val RESOLVE_TIME_TRAVEL_ON_IDENTIFIER =
    buildConf("timeTravel.resolveOnIdentifier.enabled")
      .internal()
      .doc(
        "When true, we will try to resolve patterns such as `@v123` in identifiers as time " +
          "travel nodes."
      )
      .booleanConf
      .createWithDefault(true)
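
  // Hedged illustration (the table name `events` is invented; assumes a SparkSession
  // `spark`): with this flag on, a trailing `@v<version>` in a table identifier is
  // resolved as a time-travel read of that snapshot version, e.g.:
  //
  //   spark.sql("SELECT * FROM events@v123")   // reads version 123 of the `events` table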

  val DELTA_COMMIT_INFO_ENABLED =
    buildConf("commitInfo.enabled")
      .doc("Whether to log commit information into the Delta log.")
      .booleanConf
      .createWithDefault(true)

  val DELTA_SNAPSHOT_PARTITIONS =
    buildConf("snapshotPartitions")
      .internal()
      .doc("Number of partitions to use when building a Delta Lake snapshot.")
      .intConf
      .checkValue(_ > 0, "Delta snapshot partition number must be positive.")
      .createOptional

  val DELTA_PARTITION_COLUMN_CHECK_ENABLED =
    buildConf("partitionColumnValidity.enabled")
      .internal()
      .doc(
        "Whether to check that the partition column names are valid, just like " +
          "the data column names."
      )
      .booleanConf
      .createWithDefault(true)

  val DELTA_SCHEMA_ON_READ_CHECK_ENABLED =
    buildConf("checkLatestSchemaOnRead")
      .doc(
        "In Delta, we always try to give users the latest version of their data without " +
          "having to call REFRESH TABLE or redefine their DataFrames when used in the context of " +
          "streaming. There is a possibility that the schema of the latest version of the table " +
          "may be incompatible with the schema at the time of DataFrame creation. This flag " +
          "enables a check that ensures that users won't read corrupt data if the source schema " +
          "changes in an incompatible way."
      )
      .booleanConf
      .createWithDefault(true)

  val DELTA_COLLECT_STATS =
    buildConf("stats.collect")
      .internal()
      .doc("When true, statistics are collected while writing files into a Delta table.")
      .booleanConf
      .createWithDefault(true)

  val DELTA_IMPORT_BATCH_SIZE_STATS_COLLECTION =
    buildConf("import.batchSize.statsCollection")
      .internal()
      .doc("The number of files per batch for stats collection during import.")
      .intConf
      .createWithDefault(50000)

  val DELTA_IMPORT_BATCH_SIZE_SCHEMA_INFERENCE =
    buildConf("import.batchSize.schemaInference")
      .internal()
      .doc("The number of files per batch for schema inference during import.")
      .intConf
      .createWithDefault(1000000)

  val DELTA_SAMPLE_ESTIMATOR_ENABLED =
    buildConf("sampling.enabled")
      .internal()
      .doc("Enable sample based estimation.")
      .booleanConf
      .createWithDefault(false)

  val DELTA_STATS_SKIPPING =
    buildConf("stats.skipping")
      .internal()
      .doc("When true, statistics are used for data skipping.")
      .booleanConf
      .createWithDefault(true)

  val DELTA_LIMIT_PUSHDOWN_ENABLED =
    buildConf("stats.limitPushdown.enabled")
      .internal()
      .doc(
        "If true, use the limit clause and file statistics to prune files before " +
          "they are collected to the driver."
      )
      .booleanConf
      .createWithDefault(true)

  val DELTA_STATS_SKIPPING_LOCAL_CACHE_MAX_NUM_FILES =
    buildConf("stats.localCache.maxNumFiles")
      .internal()
      .doc(
        "The maximum number of files for a table to be considered a 'delta small table'. " +
          "Some metadata operations (such as using data skipping) are optimized for small tables " +
          "using driver local caching and local execution."
      )
      .intConf
      .createWithDefault(2000)

  val DELTA_SNAPSHOT_ISOLATION =
    buildConf("snapshotIsolation.enabled")
      .internal()
      .doc(
        "Controls whether queries on Delta tables are guaranteed to have " +
          "snapshot isolation."
      )
      .booleanConf
      .createWithDefault(true)

  val DELTA_MAX_SNAPSHOT_LINEAGE_LENGTH =
    buildConf("maxSnapshotLineageLength")
      .internal()
      .doc(
        "The max lineage length of a Snapshot before Delta forces to build a Snapshot from " +
          "scratch."
      )
      .intConf
      .checkValue(_ > 0, "maxSnapshotLineageLength must be positive.")
      .createWithDefault(50)

  val DELTA_HISTORY_PAR_SEARCH_THRESHOLD =
    buildConf("history.maxKeysPerList")
      .internal()
      .doc(
        "How many commits to list when performing a parallel search. Currently set to 1000, " +
          "which is the maximum number of keys returned by S3 per list call. Azure can " +
          "return 5000 keys per call, so we choose the lower bound of 1000."
      )
      .intConf
      .createWithDefault(1000)

  val DELTA_HISTORY_METRICS_ENABLED =
    buildConf("history.metricsEnabled")
      .doc(
        "Enables metrics reporting in DESCRIBE HISTORY. CommitInfo will now record the " +
          "operation metrics."
      )
      .booleanConf
      .createWithDefault(true)

  val DELTA_VACUUM_RETENTION_CHECK_ENABLED =
    buildConf("retentionDurationCheck.enabled")
      .doc(
        "Adds a check preventing users from running vacuum with a very short retention " +
          "period, which may end up corrupting the Delta Log."
      )
      .booleanConf
      .createWithDefault(true)

  val DELTA_CHECKPOINT_PART_SIZE =
    buildConf("checkpoint.partSize")
      .internal()
      .doc("""The limit at which we will start parallelizing the checkpoint. We will attempt to
             |write a maximum of this many actions per checkpoint.
        """.stripMargin)
      .longConf
      .checkValue(_ > 0, "The checkpoint part size needs to be a positive integer.")
      .createWithDefault(5000000)

  val DELTA_SCHEMA_AUTO_MIGRATE =
    buildConf("schema.autoMerge.enabled")
      .doc("If true, enables schema merging on appends and on overwrites.")
      .booleanConf
      .createWithDefault(false)
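
  // Hedged usage sketch (assumes a SparkSession `spark`, a DataFrame `dfWithNewColumns`,
  // and a Delta table at `path`; all names invented): with auto-merge enabled, an append
  // whose DataFrame carries extra columns widens the table schema instead of failing.
  //
  //   spark.conf.set(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key, "true")
  //   dfWithNewColumns.write.format("delta").mode("append").save(path)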

  val DELTA_STATE_CORRUPTION_IS_FATAL =
    buildConf("state.corruptionIsFatal")
      .internal()
      .doc("""If true, throws a fatal error when the recreated Delta State doesn't
             |match the committed checksum file.
        """.stripMargin)
      .booleanConf
      .createWithDefault(true)

  val DELTA_ASYNC_UPDATE_STALENESS_TIME_LIMIT =
    buildConf("stalenessLimit")
      .doc("""Setting a non-zero time limit will allow you to query the last loaded state of the Delta
             |table without blocking on a table update. You can use this configuration to reduce the
             |latency on queries when up-to-date results are not a requirement. Table updates will be
             |scheduled on a separate scheduler pool in a FIFO queue, and will share cluster resources
             |fairly with your query. If a table hasn't updated past this time limit, we will block
             |on a synchronous state update before running the query.
        """.stripMargin)
      .timeConf(TimeUnit.MILLISECONDS)
      .createWithDefault(0L) // Don't let tables go stale
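
  // Hedged usage sketch (assumes a SparkSession `spark`): time-typed entries accept Spark
  // duration strings, so a one-hour staleness budget could be expressed as:
  //
  //   spark.conf.set(DeltaSQLConf.DELTA_ASYNC_UPDATE_STALENESS_TIME_LIMIT.key, "1h")
  //
  // Queries may then serve a snapshot up to one hour old while the refresh runs
  // asynchronously.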

  val DELTA_ALTER_LOCATION_BYPASS_SCHEMA_CHECK =
    buildConf("alterLocation.bypassSchemaCheck")
      .doc(
        "If true, ALTER TABLE SET LOCATION on a Delta table will go through even if the " +
          "Delta table in the new location has a different schema from the original Delta table."
      )
      .booleanConf
      .createWithDefault(false)

  val DUMMY_FILE_MANAGER_NUM_OF_FILES =
    buildConf("dummyFileManager.numOfFiles")
      .internal()
      .doc("How many dummy files to write in DummyFileManager.")
      .intConf
      .checkValue(_ >= 0, "numOfFiles cannot be negative.")
      .createWithDefault(3)

  val DUMMY_FILE_MANAGER_PREFIX =
    buildConf("dummyFileManager.prefix")
      .internal()
      .doc("The file prefix to use in DummyFileManager.")
      .stringConf
      .createWithDefault(".s3-optimization-")

  val MERGE_MAX_INSERT_COUNT =
    buildConf("merge.maxInsertCount")
      .internal()
      .doc("Max row count of inserts in each MERGE execution.")
      .longConf
      .createWithDefault(10000L)

  val MERGE_INSERT_ONLY_ENABLED =
    buildConf("merge.optimizeInsertOnlyMerge.enabled")
      .internal()
      .doc("""
             |If enabled, merge without any matched clause (i.e., insert-only merge) will be optimized
             |by avoiding rewriting old files and just inserting new files.
        """.stripMargin)
      .booleanConf
      .createWithDefault(true)
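
  // Hedged illustration (target/source names invented; assumes a SparkSession `spark`):
  // a merge with only a 'when not matched' clause is insert-only and, with this flag on,
  // avoids rewriting existing files:
  //
  //   spark.sql("""
  //     MERGE INTO target t
  //     USING source s
  //     ON t.id = s.id
  //     WHEN NOT MATCHED THEN INSERT *
  //   """)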

  val MERGE_REPARTITION_BEFORE_WRITE =
    buildConf("merge.repartitionBeforeWrite.enabled")
      .internal()
      .doc("""
             |When enabled, merge will repartition the output by the table's partition columns before
             |writing the files.
        """.stripMargin)
      .booleanConf
      .createWithDefault(false)

  val MERGE_MATCHED_ONLY_ENABLED =
    buildConf("merge.optimizeMatchedOnlyMerge.enabled")
      .internal()
      .doc("""If enabled, merge without a 'when not matched' clause will be optimized to use a
             |right outer join instead of a full outer join.
        """.stripMargin)
      .booleanConf
      .createWithDefault(true)
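
  // Hedged counterpart to the insert-only sketch above (names invented): a merge with
  // only matched clauses can be planned as a right outer join under this flag:
  //
  //   spark.sql("""
  //     MERGE INTO target t
  //     USING source s
  //     ON t.id = s.id
  //     WHEN MATCHED THEN UPDATE SET *
  //   """)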
}
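
// Editorial sketch (not part of the original file) of how these entries are typically
// read on the Delta side: Spark's SQLConf exposes typed getters for ConfigEntry and
// OptionalConfigEntry values.
//
//   import org.apache.spark.sql.internal.SQLConf
//
//   val collectStats: Boolean = SQLConf.get.getConf(DeltaSQLConf.DELTA_COLLECT_STATS)
//   val checkpointPartSize: Long = SQLConf.get.getConf(DeltaSQLConf.DELTA_CHECKPOINT_PART_SIZE)
//   val snapshotPartitions: Option[Int] = SQLConf.get.getConf(DeltaSQLConf.DELTA_SNAPSHOT_PARTITIONS)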



