# Copyright 2022 ABSA Group Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pramen {
ingestion.name = "Unspecified"
environment.name = "DEV"
bookkeeping.enabled = "true"
## This specifies the location to keep locks and bookkeeping. One of the following must be specified.
# Use PostgreSQL for bookkeeping
# bookkeeping.jdbc {
# driver = "org.postgresql.Driver"
# url = "jdbc:postgresql://host:5433/pramen"
# user = "username"
# password = "password"
# }
# Use MongoDB for bookkeeping
# bookkeeping.mongodb.connection.string = "mongodb://127.0.0.1"
# bookkeeping.mongodb.database = "pramen"
# Use Hadoop (HDFS, S3, etc) for bookkeeping
# bookkeeping.location = ""
# Bookkeeping storage format: "delta" or "text"
bookkeeping.hadoop.format = "text"
# Default information date column used for the metastore. Sourced tables will be partitioned by this field.
information.date.column = "pramen_info_date"
information.date.format = "yyyy-MM-dd"
# You can set this option so that Pramen never writes to partitions older than the specified date
#information.date.start = "2010-01-01"
# Alternatively, you can specify the same restriction as a number of days back from the current calendar date.
#information.date.max.days.behind = 30
# If non-zero, specifies how many tasks can run in parallel
parallel.tasks = 1
# Enables Hive (requires Hive JARs to be in the classpath)
enable.hive = true
# The API to use to query Hive. Valid values are: "sql", "spark_catalog"
hive.api = "sql"
# When possible, prefer ADD PARTITION to MSCK REPAIR when updating metastore tables in Hive.
# It is not always possible. When a table is initially created, MSCK REPAIR is always used to pick up all partitions.
# Also, ADD PARTITION is only available for the Parquet format.
# This option can be overridden per metatable.
hive.prefer.add.partition = false
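# A minimal sketch of a per-table override, assuming the usual metastore table list layout
# (the table name, format, and path below are hypothetical placeholders):
# metastore.tables = [
#   { name = "my_table", format = "parquet", path = "/data/my_table", hive.prefer.add.partition = true }
# ]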
# If enabled, Pramen will wait for the output table to become available before running a job
# If the number of seconds is <= 0, the wait is infinite
wait.for.output.table.enabled = false
wait.for.output.table.seconds = 600
# How many days to check back for late data
# 0 - never check for updates after the data is loaded
# 1 - check only the current info date if you run the job more than once per day
# 2 - check the latest info date and the date before
# etc...
# You can also set this parameter for individual tables in the metastore.
track.days = 5
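# For example, with track.days = 5 and a run on 2022-01-10 (assuming a daily job whose
# information date equals the run date), Pramen re-checks info dates 2022-01-06 through
# 2022-01-10 for late or updated data.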
# Do not expect data to arrive for the specified number of days from now
# (This is a DEPRECATED parameter, please do not change the default)
expected.delay.days = 0
# This needs to be specified. The path should be accessible by Hadoop (HDFS, S3, etc)
# temporary.directory = ""
warn.throughput.rps = 2000
good.throughput.rps = 40000
dry.run = false
# If true, a more detailed log will be generated
verbose = false
use.lock = true
# Send an email even if there are no changes and no late or not-ready data
email.if.no.changes = true
check.only.late.data = false
check.only.new.data = false
# If this is set, the workflow will be re-run for the specified information date.
#rerun.info.date =
# If this is set, the current date will be overridden by the specified value.
#current.date =
#spark.conf = {
# Pass arbitrary Spark configuration when initializing the Spark session
# For example, an alternative way of writing Parquet in the legacy format is
# spark.sql.parquet.writeLegacyFormat = true
# These options are needed so that the job can create Hive tables
#hive.metastore.uris = "thrift://host1:9083,thrift://host2:9083"
#spark.sql.warehouse.dir = "/hive/warehouse"
#}
# Other option(s) might be
# spark.conf.spark.sql.parquet.binaryAsString = true
# Default information date expression for daily jobs
default.daily.output.info.date.expr = "@runDate"
# Default information date expression for weekly jobs (Monday of the current week)
default.weekly.output.info.date.expr = "lastMonday(@runDate)"
# Default information date expression for monthly jobs (the first day of the month)
default.monthly.output.info.date.expr = "beginOfMonth(@runDate)"
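# For example, to shift the information date of daily jobs one day back, one could set:
# default.daily.output.info.date.expr = "@runDate - 1"
# (the same '@runDate - N' date arithmetic is used for the initial sourcing defaults below)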
# Default operation type (from 'ingestion', 'transformation', 'sink', 'transfer')
#default.operation.type = "ingestion"
# Default number of records per partition for metastore tables
#default.records.per.partition = 1000000
# Default minimum dates from which to start initial data sourcing of a table when no bookkeeping
# information exists for that table
initial.sourcing.date.daily.expr = "@runDate"
initial.sourcing.date.weekly.expr = "@runDate - 6"
initial.sourcing.date.monthly.expr = "beginOfMonth(@runDate)"
# Pramen can stop the Spark session at the end of execution. This can help cleanly finalize
# jobs started from 'spark-submit'. But when running on Databricks this results in a job failure.
# Use it with caution.
stop.spark.session = false
# Pramen will return a non-zero exit code on failures by default. But on Databricks this causes the job
# to end prematurely and fail.
exit.code.enabled = true
timezone = "Africa/Johannesburg"
# Specifies the list of special characters to replace with '_' in column names if they are encountered on ingestion.
special.characters.in.column.names = "' :+-=<>()[]{}*?/\\\""
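# For example, under this setting a source column named "Account No (primary)" would be
# ingested as "Account_No__primary_", each listed character being replaced individually.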
# If set to true, the job will log the list of executor nodes.
# It takes some time to determine them, so it is disabled by default.
log.executor.nodes = false
# Init and finalize hooks (optional). Hook classes should extend/implement 'Runnable':
#hook {
# startup.class = "com.example.MyRunnable"
# shutdown.class = "com.example.MyRunnable"
#}
# If true, allows multiple transformers to output to the same metastore table,
# as long as their schedules do not overlap. That is, partitions produced by
# one transformer are different from partitions produced by other transformers.
# You can set it to false to have a more strict setup and disallow this behavior.
enable.multiple.jobs.per.output.table = true
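# For example, a transformer producing Monday partitions and another producing month-end
# partitions may share an output table, since the partitions they write never overlap.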
# Limits on certain elements of notifications (email etc)
notifications {
# The maximum length in characters of the Reason field in the completed tasks table.
reason.max.length = 1024
# The maximum length of errors and exceptions in the notification body. The default value is selected
# so that Pramen can handle at least 100 exceptions in a single email notification.
exception.max.length = 65536
}
}
pramen.py {
# Path to Pramen-Py (must be populated in order to support Pramen-Py)
#location = ""
executable = "pramen-py"
cmd.line.template = "@location/@executable transformations run @pythonClass -c @metastoreConfig --info-date @infoDate"
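# With the template above, the substituted command line would look like this (the location,
# class name, config path, and date below are hypothetical):
# /opt/pramen-py/pramen-py transformations run MyTransformer -c /tmp/metastore.yaml --info-date 2022-01-10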
keep.log.lines = 2000
}
mail {
# Any options from https://javaee.github.io/javamail/docs/api/com/sun/mail/smtp/package-summary.html
#smtp.host = ""
smtp.port = "25"
smtp.auth = "false"
smtp.starttls.enable = "false"
smtp.EnableSSL.enable = "false"
debug = "false"
send.from = "Pramen "
send.to = ""
}
hadoop.redacted.tokens = [ password, secret, session.token, access.key ]
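# For example, with the list above any Hadoop option whose key contains one of these tokens
# (such as 'fs.s3a.secret.key' below) would have its value masked when Pramen logs the
# Hadoop configuration; the exact masking text is an implementation detail.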
# Hadoop options to access S3
# hadoop.conf {
# # Authentication provider. Can be one of:
# # * org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
# # * com.amazonaws.auth.profile.ProfileCredentialsProvider
# # * com.amazonaws.auth.InstanceProfileCredentialsProvider
# # Use the default provider chain. It will use the first authentication provider that succeeds:
# fs.s3a.aws.credentials.provider = "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
#
# # When an AWS profile is used from ~/.aws, you can set
# # AWS_PROFILE to specify the exact profile and
# # AWS_CREDENTIAL_PROFILES_FILE (and AWS_CONFIG_FILE) to specify the .aws config file if it is not ~/.aws/credentials
# # Enable bucket key encryption
# fs.s3a.server-side-encryption-bucket-key-enabled = "true"
#
# # Enable magic committer for all buckets to have the best tradeoff between performance and safety
# fs.s3a.committer.name = "magic"
# fs.s3a.committer.magic.enabled = "true"
#
# # Explicitly specify the endpoint
# # fs.s3a.endpoint = "s3.af-south-1.amazonaws.com"
#
# # Per-bucket endpoint and credentials configuration (for bucket named 'mybucket'):
# fs.s3a.bucket.mybucket.endpoint=http://myendpoint
# fs.s3a.bucket.mybucket.path.style.access=true
# fs.s3a.bucket.mybucket.access.key = "AAABBBAAABBBAAABBBAA111"
# fs.s3a.bucket.mybucket.secret.key = "abc123abc123abc123abc123abc123abc123"
# # AWS credentials
# # fs.s3a.access.key = "AAABBBAAABBBAAABBBAA111"
# # fs.s3a.secret.key = "abc123abc123abc123abc123abc123abc123"
# # The session token for temporary credentials
# # fs.s3a.session.token = ""
# }
# Javax configuration (for accessing services via HTTPS)
# javax.net.ssl.trustStore = ""
# javax.net.ssl.trustStorePassword = ""
# javax.net.ssl.keyStore = ""
# javax.net.ssl.keyStorePassword = ""
# javax.net.ssl.password = ""
# java.security.auth.login.config = ""
# java.security.krb5.conf = ""
# javax.net.debug = ""