# Copyright 2022 ABSA Group Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pramen {
ingestion.name = "Unspecified"
environment.name = "DEV"
bookkeeping.enabled = "true"
## This specifies the location to keep locks and bookkeeping. One of the following must be specified.
# Use PostgreSQL for bookkeeping
# bookkeeping.jdbc {
# driver = "org.postgresql.Driver"
# url = "jdbc:postgresql://host:5433/pramen"
# user = "username"
# password = "password"
# }
# Use MongoDB for bookkeeping
# bookkeeping.mongodb.connection.string = "mongodb://127.0.0.1"
# bookkeeping.mongodb.database = "pramen"
# Use Hadoop (HDFS, S3, etc) for bookkeeping
# bookkeeping.location = ""
# Bookkeeping storage format: "delta" or "text"
bookkeeping.hadoop.format = "text"
# Default information date column used for the metastore. Sourced tables will be partitioned by this field.
information.date.column = "pramen_info_date"
information.date.format = "yyyy-MM-dd"
# You can set this option so that Pramen never writes to partitions older than the specified date
#information.date.start = "2010-01-01"
# Alternatively, you can specify the same restriction as a number of days back from the current calendar date.
#information.date.max.days.behind = 30
# If non-zero, specifies how many tasks can run in parallel
parallel.tasks = 1
# Enables Hive (requires Hive JARs to be in the classpath)
enable.hive = true
# The API to use to query Hive. Valid values are: "sql", "spark_catalog"
hive.api = "sql"
# When possible, prefer ADD PARTITION to MSCK REPAIR when updating metastore tables in Hive.
# It is not always possible. When a table is initially created, MSCK REPAIR is always used to pick up all partitions.
# Also, ADD PARTITION is only available for the Parquet format.
# This option can be overridden per metatable.
hive.prefer.add.partition = false
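# A minimal sketch of a per-table override, assuming the usual metastore table list layout
# (the table name, format, and path below are hypothetical placeholders):
# metastore.tables = [
#   { name = "my_table", format = "parquet", path = "/data/my_table", hive.prefer.add.partition = true }
# ]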
# If enabled, Pramen will wait for the output table to become available before running a job
# If the number of seconds is <= 0, the wait is infinite
wait.for.output.table.enabled = false
wait.for.output.table.seconds = 600
# How many days to check back for late data
# 0 - never check for updates after the data is loaded
# 1 - check only the current info date if you run the job more than once per day
# 2 - check the latest info date and the date before
# etc...
# You can also set this parameter for individual tables in the metastore.
track.days = 5
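# For example, with track.days = 5 and a run on 2022-01-10 (assuming a daily job whose
# information date equals the run date), Pramen re-checks info dates 2022-01-06 through
# 2022-01-10 for late or updated data.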
# Do not expect data to arrive for the specified number of days from now
# (This is a DEPRECATED parameter, please do not change the default)
expected.delay.days = 0
# This needs to be specified. The path should be accessible by Hadoop (HDFS, S3, etc)
# temporary.directory = ""
warn.throughput.rps = 2000
good.throughput.rps = 40000
dry.run = false
# If true, a more detailed log will be generated
verbose = false
use.lock = true
# Send an email even if there are no changes and no late or not-ready data
email.if.no.changes = true
check.only.late.data = false
check.only.new.data = false
# If this is set, the workflow will be re-run for the specified information date.
#rerun.info.date =
# If this is set, the current date will be overridden by the specified value.
#current.date =
#spark.conf = {
# Pass arbitrary Spark configuration when initializing the Spark session
# For example, an alternative way of writing Parquet in the legacy format is
# spark.sql.parquet.writeLegacyFormat = true
# These options are needed so that the job can create Hive tables
#hive.metastore.uris = "thrift://host1:9083,thrift://host2:9083"
#spark.sql.warehouse.dir = "/hive/warehouse"
#}
# Other option(s) might be
# spark.conf.spark.sql.parquet.binaryAsString = true
# Default information date expression for daily jobs
default.daily.output.info.date.expr = "@runDate"
# Default information date expression for weekly jobs (Monday of the current week)
default.weekly.output.info.date.expr = "lastMonday(@runDate)"
# Default information date expression for monthly jobs (the first day of the month)
default.monthly.output.info.date.expr = "beginOfMonth(@runDate)"
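# For example, to shift the information date of daily jobs one day back, one could set:
# default.daily.output.info.date.expr = "@runDate - 1"
# (the same '@runDate - N' date arithmetic is used for the initial sourcing defaults below)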
# Default operation type (from 'ingestion', 'transformation', 'sink', 'transfer')
#default.operation.type = "ingestion"
# Default number of records per partition for metastore tables
#default.records.per.partition = 1000000
# Default minimum dates from which to start initial data sourcing of a table when no bookkeeping
# information exists for that table
initial.sourcing.date.daily.expr = "@runDate"
initial.sourcing.date.weekly.expr = "@runDate - 6"
initial.sourcing.date.monthly.expr = "beginOfMonth(@runDate)"
# Pramen can stop the Spark session at the end of execution. This can help cleanly finalize
# jobs started from 'spark-submit'. But when running on Databricks this results in a job failure.
# Use it with caution.
stop.spark.session = false
# Pramen will return a non-zero exit code on failures by default. But on Databricks this causes the job
# to end prematurely and fail.
exit.code.enabled = true
timezone = "Africa/Johannesburg"
# Specifies the list of special characters to replace with '_' in column names if they are encountered on ingestion.
special.characters.in.column.names = "' :+-=<>()[]{}*?/\\\""
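# For example, under this setting a source column named "Account No (primary)" would be
# ingested as "Account_No__primary_", each listed character being replaced individually.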
# If set to true, the job will log the list of executor nodes.
# It takes some time to determine them, so it is disabled by default.
log.executor.nodes = false
# Init and finalize hooks (optional). Hook classes should extend/implement 'Runnable':
#hook {
# startup.class = "com.example.MyRunnable"
# shutdown.class = "com.example.MyRunnable"
#}
# If true, allows multiple transformers to output to the same metastore table,
# as long as their schedules do not overlap. That is, partitions produced by
# one transformer are different from partitions produced by other transformers.
# You can set it to false to have a more strict setup and disallow this behavior.
enable.multiple.jobs.per.output.table = true
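# For example, a transformer producing Monday partitions and another producing month-end
# partitions may share an output table, since the partitions they write never overlap.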
# Limits on certain elements of notifications (email etc)
notifications {
# The maximum length in characters of the Reason field in the completed tasks table.
reason.max.length = 1024
# The maximum length of errors and exceptions in the notification body. The default value is selected
# so that Pramen can handle at least 100 exceptions in a single email notification.
exception.max.length = 65536
}
}
pramen.py {
# Path to Pramen-Py (must be populated in order to support Pramen-Py)
#location = ""
executable = "pramen-py"
cmd.line.template = "@location/@executable transformations run @pythonClass -c @metastoreConfig --info-date @infoDate"
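# With the template above, the substituted command line would look like this (the location,
# class name, config path, and date below are hypothetical):
# /opt/pramen-py/pramen-py transformations run MyTransformer -c /tmp/metastore.yaml --info-date 2022-01-10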
keep.log.lines = 2000
}
mail {
# Any options from https://javaee.github.io/javamail/docs/api/com/sun/mail/smtp/package-summary.html
#smtp.host = ""
smtp.port = "25"
smtp.auth = "false"
smtp.starttls.enable = "false"
smtp.EnableSSL.enable = "false"
debug = "false"
send.from = "Pramen "
send.to = ""
}
hadoop.redacted.tokens = [ password, secret, session.token, access.key ]
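# For example, with the list above any Hadoop option whose key contains one of these tokens
# (such as 'fs.s3a.secret.key' below) would have its value masked when Pramen logs the
# Hadoop configuration; the exact masking text is an implementation detail.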
# Hadoop options to access S3
# hadoop.conf {
# # Authentication provider. Can be one of:
# # * org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
# # * com.amazonaws.auth.profile.ProfileCredentialsProvider
# # * com.amazonaws.auth.InstanceProfileCredentialsProvider
# # Use the default provider chain. It will use the first authentication provider that succeeds:
# fs.s3a.aws.credentials.provider = "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
#
# # When an AWS profile is used from ~/.aws, you can set
# # AWS_PROFILE to specify the exact profile and
# # AWS_CREDENTIAL_PROFILES_FILE (and AWS_CONFIG_FILE) to specify the .aws config file if it is not ~/.aws/credentials
# # Enable bucket key encryption
# fs.s3a.server-side-encryption-bucket-key-enabled = "true"
#
# # Enable magic committer for all buckets to have the best tradeoff between performance and safety
# fs.s3a.committer.name = "magic"
# fs.s3a.committer.magic.enabled = "true"
#
# # Explicitly specify the endpoint
# # fs.s3a.endpoint = "s3.af-south-1.amazonaws.com"
#
# # Per-bucket endpoint and credentials configuration (for bucket named 'mybucket'):
# fs.s3a.bucket.mybucket.endpoint=http://myendpoint
# fs.s3a.bucket.mybucket.path.style.access=true
# fs.s3a.bucket.mybucket.access.key = "AAABBBAAABBBAAABBBAA111"
# fs.s3a.bucket.mybucket.secret.key = "abc123abc123abc123abc123abc123abc123"
# # AWS credentials
# # fs.s3a.access.key = "AAABBBAAABBBAAABBBAA111"
# # fs.s3a.secret.key = "abc123abc123abc123abc123abc123abc123"
# # The session token for temporary credentials
# # fs.s3a.session.token = ""
# }
# Javax configuration (for accessing services via HTTPS)
# javax.net.ssl.trustStore = ""
# javax.net.ssl.trustStorePassword = ""
# javax.net.ssl.keyStore = ""
# javax.net.ssl.keyStorePassword = ""
# javax.net.ssl.password = ""
# java.security.auth.login.config = ""
# java.security.krb5.conf = ""
# javax.net.debug = ""