# Copyright 2018 ABSA Group Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration added here is considered the application default and will be used
# for keys that are not specified in the provided 'application.conf' or via system properties.
# Here is the precedence of configuration (top ones have higher precedence):
# 1. System Properties (e.g. passed as '-Dkey=value')
# 2. application.conf (e.g. provided as '-Dconfig.file=...')
# 3. reference.conf
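# For example, if '-Denceladus.rest.retryCount=3' is passed on the command line while
# 'application.conf' sets 'enceladus.rest.retryCount=1' and this file sets it to 0,
# the effective value is 3 (the system property wins).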

# IMPORTANT. Add all sensitive configuration keys to Constants.ConfigKeysToRedact.

# The REST API URI can specify multiple semicolon-separated base URIs;
# each can list multiple comma-separated hosts. These are used for fault tolerance.
enceladus.rest.uri="http://localhost:8080,host2:9000/rest_api;https://localhost:8080,host2:9000/rest_api"
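# Given the format described above, the example value is intended to expand to candidate URLs like:
#   http://localhost:8080/rest_api, http://host2:9000/rest_api,
#   https://localhost:8080/rest_api, https://host2:9000/rest_api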

# Each of the above URIs can be retried multiple times for fault tolerance (optional)
enceladus.rest.retryCount=0

# Currently supported values: [401, 403, 404]
enceladus.rest.optionallyRetryableExceptions=[]
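# e.g. to retry on 401 and 404 responses:
#enceladus.rest.optionallyRetryableExceptions=[401,404]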

# The following value can be either `roundrobin` or `fallback`. It specifies the order in which the REST API URLs are used:
# - roundrobin - a random URL from the list is used first; if it fails, the next one is tried, wrapping around to the start of the list until all have been tried
# - fallback - the first URL is used, and only if it fails is the second tried, and so on
enceladus.rest.availability.setup="roundrobin"

# URI(s) to Menas (the UI of Enceladus)
# Multiple URIs can be separated by semicolons; each can list multiple comma-separated hosts.
enceladus.menas.uri="http://localhost:8080,host2:9000/menas;https://localhost:8080,host2:9000/enceladus/menas"

# An 'enceladus_record_id' column can be added containing either a true UUID or a stable
# (row-hash-based) ID, or the column can be omitted entirely. Allowed values: "uuid", "stableHashId", "none"
enceladus.recordId.generation.strategy="uuid"

# Whether errCol is (non-)nullable in the output DataFrame schema - both for Standardization and Conformance, regardless of the errCol input schema
# Warning: this works on the DataFrame level; when writing to/reading from Parquet, columns are nullable by default, see:
# https://spark.apache.org/docs/3.1.2/sql-data-sources-parquet.html
enceladus.errCol.nullable=true

standardized.hdfs.path="/tmp/conformance-output/standardized-{0}-{1}-{2}-{3}"

# Pattern used to look up the mapping table for the specified date
# {0} - Year, {1} - Month, {2} - Day of month
conformance.mappingtable.pattern="reportDate={0}-{1}-{2}"
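# e.g. for a report date of 2021-07-05 this resolves to "reportDate=2021-07-05"
# (assuming zero-padded month and day values)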

# Use experimental mapping rule implementation that groups explosions for rules
# operating on the same array
conformance.mapping.rule.experimental.implementation=true

# Specify when to use the broadcasting strategy for mapping rules.
# Can be one of: auto, never, always (warning! use 'always' with caution)
# When set to 'auto', the strategy is used for mapping tables whose size does not exceed
# the limit specified in 'conformance.mapping.rule.max.broadcast.size.mb'
conformance.mapping.rule.broadcast=never

# Maximum size (in MB) of a mapping table to use the efficient broadcasting mapping rule strategy
conformance.mapping.rule.max.broadcast.size.mb=10
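# e.g. with the default limit of 10 MB and 'conformance.mapping.rule.broadcast=auto',
# a 5 MB mapping table would be broadcast while a 50 MB one would not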

# Enable workaround for Catalyst execution plan optimization freeze
conformance.catalyst.workaround=true

# Automatically delete standardized data folder after successful run of a Conformance job
conformance.autoclean.standardized.hdfs.folder=false

# Enforce info file validation
# Can be one of: strict, warning, none
control.info.validation=warning

# Properties written into the info file carry this prefix:
control.info.dataset.properties.prefix=""

# system-wide time zone
timezone="UTC"

# Possible values: plan, dataFrame, sample
partition.strategy="plan"

# Block size in bytes used for repartition/coalesce; needed for any strategy except recordCount
#min.processing.partition.size=31457280
#max.processing.partition.size=134217728

# If the 'sample' strategy is selected:
#partition.sample.size=100

# Control plugins
# Several plugins can be used; the numeric suffix at the end of the key needs to be incremented for each plugin.
#standardization.plugin.control.metrics.1=za.co.absa.enceladus.KafkaPluginFactory
#conformance.plugin.control.metrics.1=za.co.absa.enceladus.KafkaPluginFactory

# Postprocessing plugins
# Several plugins can be chained; the numeric suffix at the end of the key needs to be incremented for each plugin.
#standardization.plugin.postprocessor.1=za.co.absa.enceladus.KafkaErrColumnsPluginFactory
#conformance.plugin.postprocessor.1=za.co.absa.enceladus.KafkaErrColumnsPluginFactory
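# For example, chaining two postprocessor plugins (the second class name is purely illustrative):
#conformance.plugin.postprocessor.1=za.co.absa.enceladus.KafkaErrColumnsPluginFactory
#conformance.plugin.postprocessor.2=com.example.MyPostProcessorPluginFactory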

# Built-in plugins

# 1. Push Control Info measurements to a Kafka topic
# (uncomment to enable)
#standardization.plugin.control.metrics.1=za.co.absa.enceladus.plugins.builtin.controlinfo.mq.kafka.KafkaInfoPlugin
#conformance.plugin.control.metrics.1=za.co.absa.enceladus.plugins.builtin.controlinfo.mq.kafka.KafkaInfoPlugin

# 2. Push errors to a Kafka topic
#standardization.plugin.postprocessor.1=za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin
#conformance.plugin.postprocessor.1=za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin

#kafka.schema.registry.url:"http://127.0.0.1:8081"
#kafka.bootstrap.servers="127.0.0.1:9092"
#kafka.info.metrics.client.id="controlInfo"
#kafka.info.metrics.topic.name="control.info"

#kafka.error.client.id="errorInfo"
#kafka.error.topic.name="dq.errors"

# Optional security settings
#kafka.security.protocol="SASL_SSL"
#kafka.sasl.mechanism="GSSAPI"

# Optional Schema Registry Parameters
#kafka.schema.registry.basic.auth.credentials.source=USER_INFO
#kafka.schema.registry.basic.auth.user.info=user:password

# SSL configuration specifics
#javax.net.ssl.trustStore=/path/to/my-truststore.jks
#javax.net.ssl.trustStorePassword=trustStoreExamplePassword1
#javax.net.ssl.keyStore=/path/to/my-keystore.jks
#javax.net.ssl.keyStorePassword=keyStoreExamplePassword1
#java.security.auth.login.config="/path/to/jaas.config"
# JAAS config should contain your `KafkaClient { ... }` settings - Kerberos-based or other
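# A minimal Kerberos-based jaas.config sketch (keytab path and principal are illustrative):
#   KafkaClient {
#     com.sun.security.auth.module.Krb5LoginModule required
#     useKeyTab=true
#     storeKey=true
#     keyTab="/path/to/client.keytab"
#     principal="client@EXAMPLE.COM";
#   };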

# Optional - allows original dataframe columns to be dropped
conformance.allowOriginalColumnsMutability=false

# FS permissions for the _INFO file; if not set, the default derived from the Hadoop configuration is used
atum.hdfs.info.file.permissions=644
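# (note: 644 corresponds to rw-r--r--, i.e. owner read/write, group and others read-only)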



