# secor.common.properties
# Kafka to s3/gs/swift logs exporter
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
############
# MUST SET #
############
# Regular expression matching names of consumed topics.
secor.kafka.topic_filter=.*
secor.kafka.topic_blacklist=
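# Example (illustrative values, not defaults): consume every topic starting with
# "events_" except the debug ones:
# secor.kafka.topic_filter=events_.*
# secor.kafka.topic_blacklist=events_debug.*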
# Choose the cloud service you are using.
# Valid values are S3, GS, Swift, or Azure.
cloud.service=S3
# AWS authentication credentials.
# Leave empty if using IAM role-based authentication with s3a filesystem.
aws.access.key=
aws.secret.key=
# Session token only required if using temporary S3 access keys
aws.session.token=
aws.role=
# Optional Proxy Setting. Set to true to enable proxy
# Only applicable to S3UploadManager
aws.proxy.isEnabled=false
aws.proxy.http.host=
aws.proxy.http.port=
################
# END MUST SET #
################
# AWS region or endpoint. region should be a known region name (e.g.
# us-east-1). endpoint should be a known S3 endpoint url. If neither
# is specified, then the default region (us-east-1) is used. If both
# are specified, then endpoint is used.
#
# Only applies if the S3UploadManager is used - see
# secor.upload.manager.class.
#
# http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
aws.region=
aws.endpoint=
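# Example (illustrative values only; pick the region or endpoint of your bucket):
# aws.region=us-west-2
# aws.endpoint=https://s3.us-west-2.amazonaws.com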
# Toggle the AWS S3 client between virtual host style access and path style
# access. See http://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html
aws.client.pathstyleaccess=false
###########################
# START AWS S3 ENCRYPTION #
###########################
# Type of AWS S3 server-side encryption to use, if any.
# Set to S3 to enable S3-managed encryption.
# Set to KMS to enable AWS KMS-managed encryption (see aws.sse.kms.key).
# Set to customer to enable customer-managed encryption (see aws.sse.customer.key).
# Leave empty to disable encryption.
aws.sse.type=
# Key to use for S3 server-side encryption, base64-encoded
# Note: requires aws.sse.type to be set to customer to be used
aws.sse.customer.key=
# KMS Key to use for S3 server-side encryption, base64-encoded
# Leave empty to use default generated key
# Note: requires aws.sse.type to be set to KMS to be used
aws.sse.kms.key=
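# Example (illustrative): enable S3-managed encryption, which needs no key:
# aws.sse.type=S3
# Or use KMS with a specific key (placeholder value shown):
# aws.sse.type=KMS
# aws.sse.kms.key=<base64-encoded key>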
#########################
# END AWS S3 ENCRYPTION #
#########################
# Hadoop filesystem to use. Choices are s3n or s3a.
# See https://wiki.apache.org/hadoop/AmazonS3 for details.
secor.s3.filesystem=s3n
# Swift config, MUST configure if cloud.service=Swift
# Swift Login Details:
swift.use.get.auth=true
swift.auth.url=
swift.tenant=
swift.username=
swift.port=8080
swift.public=true
# only needed if "swift.use.get.auth" = false
swift.password=
# only needed if "swift.use.get.auth" = true
swift.api.key=
# GS config, MUST configure if cloud.service=GS
# Name of the Google cloud storage bucket where log files are stored.
secor.gs.bucket=secor_gs
# Google cloud storage path where files are stored within the bucket.
secor.gs.path=data
# Use direct uploads
# WARNING: disables resumable uploads, files are uploaded in a single request
# This may help prevent IOException: insufficient data written,
# see https://github.com/pinterest/secor/issues/177
# https://cloud.google.com/storage/docs/json_api/v1/how-tos/upload
secor.gs.upload.direct=false
# Application credentials configuration file
# https://developers.google.com/identity/protocols/application-default-credentials
# It can be left empty when secor is running on Google Cloud VMs with the proper scopes
secor.gs.credentials.path=
# Zookeeper config.
zookeeper.session.timeout.ms=3000
zookeeper.sync.time.ms=200
# Zookeeper path (chroot) under which secor data will be placed.
secor.zookeeper.path=/
# Impacts how frequently the upload logic is triggered if no messages are delivered.
kafka.consumer.timeout.ms=10000
# Where consumer should read from if no committed offset in zookeeper.
# "smallest" -> read from earliest offset
# "largest" -> read from latest offset
# Always use "smallest" unless you know what you're doing and are willing to risk
# data loss for new topics or topics whose number of partitions has changed.
# See the kafka docs for "auto.offset.reset".
kafka.consumer.auto.offset.reset=smallest
# Same as the old configuration above, except the accepted values are "earliest" and "latest"
# instead of "smallest" and "largest".
kafka.new.consumer.auto.offset.reset=earliest
# Comma-separated list of topics to consume. Please note that this is not a regular expression.
# If that's what you want, you can use "secor.kafka.topic_filter" instead.
kafka.new.consumer.topic.list=
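# Example (illustrative topic names): consume exactly these two topics with the new consumer:
# kafka.new.consumer.topic.list=access_log,request_log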
kafka.new.consumer.poll.timeout.seconds=10
kafka.new.consumer.request.timeout.ms=
kafka.new.consumer.ssl.key.password=
kafka.new.consumer.ssl.keystore.location=
kafka.new.consumer.ssl.keystore.password=
kafka.new.consumer.ssl.truststore.location=
kafka.new.consumer.ssl.truststore.password=
kafka.new.consumer.isolation.level=
kafka.new.consumer.max.poll.records=
kafka.new.consumer.sasl.client.callback.handler.class=
kafka.new.consumer.sasl.jaas.config=
kafka.new.consumer.sasl.kerberos.service.name=
kafka.new.consumer.sasl.login.callback.handler.class=
kafka.new.consumer.sasl.login.class=
kafka.new.consumer.sasl.mechanism=
kafka.new.consumer.security.protocol=
kafka.new.consumer.ssl.enabled.protocols=
kafka.new.consumer.ssl.keystore.type=
kafka.new.consumer.ssl.protocol=
kafka.new.consumer.ssl.provider=
kafka.new.consumer.ssl.truststore.type=
kafka.new.consumer.partition.assignment.strategy.class=
# Choose between the range and roundrobin partition assignment strategies for kafka
# high level consumers. Check PartitionAssignor.scala in the kafka 821 module for
# the differences between the two.
# In kafka 811, only the range strategy is supported.
kafka.partition.assignment.strategy=range
# Max number of retries during rebalance.
kafka.rebalance.max.retries=
# Rebalance backoff.
kafka.rebalance.backoff.ms=
# Kafka consumer receive buffer size (socket.receive.buffer.bytes)
kafka.socket.receive.buffer.bytes=
# Kafka fetch max size (fetch.message.max.bytes)
kafka.fetch.message.max.bytes=
# Kafka fetch min bytes (fetch.min.bytes)
kafka.fetch.min.bytes=
kafka.fetch.max.bytes=
# Kafka fetch max wait ms (fetch.max.wait.ms)
kafka.fetch.wait.max.ms=
# Port of the broker serving topic partition metadata.
kafka.seed.broker.port=9092
# Zookeeper path at which kafka is registered. In Zookeeper parlance, this is referred
# to as the chroot.
kafka.zookeeper.path=/
# URL of a Confluent Schema Registry: https://docs.confluent.io/current/schema-registry/docs/index.html
# Only used for decoding Avro messages.
schema.registry.url=
# Store offset in zookeeper and kafka consumer topic.
# Only used if kafka.offsets.storage is set to "kafka"
# http://kafka.apache.org/documentation.html#oldconsumerconfigs
# Possible values: true or false
kafka.dual.commit.enabled=true
# Offset storage.
# Possible values: "zookeeper" to read offset from zookeeper or "kafka" to read offset from kafka consumer topic
kafka.offsets.storage=zookeeper
include=kafka.properties
# Secor generation is a version that should be incremented during non-backwards-compatible
# Secor releases. Generation number is one of the components of generated log file names.
# Generation number makes sure that outputs of different Secor versions are isolated.
secor.generation=1
# Number of consumer threads per Secor process.
secor.consumer.threads=7
# Consumption rate limit enforced at the process level (not a consumer-thread level).
secor.messages.per.second=10000
# Used by the "backup" consumer group only.
# Number of consecutive message offsets that constitute a single offset= partition on s3.
# Example:
# if set to 10,
# messages with offsets 0 to 9 will be written to s3 path s3n://.../offset=0/...
# messages with offsets 10 to 19 will be written to s3 path s3n://.../offset=10/...
# ...
secor.offsets.per.partition=10000000
secor.offsets.prefix=offset=
# How long it takes for secor to forget a topic partition. Applies to stats generation only.
secor.topic_partition.forget.seconds=600
# Set this to true to make the partitioner use hourly partitions.
# By default, the partitioner creates daily partitions, so the data will be
# written into
# s3n://.../topic/dt=2015-07-07/
# If this parameter is set to true, the data will be written into
# s3n://.../topic/dt=2015-07-07/hr=02
# The hour folder ranges from 00 to 23
partitioner.granularity.hour=false
partitioner.granularity.minute=false
partitioner.granularity.date.prefix=dt=
partitioner.granularity.hour.prefix=hr=
partitioner.granularity.minute.prefix=min=
partitioner.granularity.date.format=yyyy-MM-dd
partitioner.granularity.hour.format=HH
partitioner.granularity.minute.format=mm
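# Example (illustrative, assuming both hourly and minute granularity are enabled
# with the default prefixes/formats above): output lands under paths such as
# s3n://.../topic/dt=2015-07-07/hr=02/min=30/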
# How many seconds the finalizer should wait before finalizing a partition
partitioner.finalizer.delay.seconds=3600
# During partition finalization, the finalizer will start from the last
# time partition (e.g. dt=2015-07-17) and traverse backwards for n
# partition periods (e.g. dt=2015-07-16, dt=2015-07-15 ...)
# This parameter controls how many partition periods to traverse back
# The default is 10
# secor.finalizer.lookback.periods=10
# If greater than 0, upon startup Secor will clean up directories and files under secor.local.path
# that are older than this value.
secor.local.log.delete.age.hours=-1
# Secor comes with a tool that adds Hive partitions for finalized topics. Currently, we support
# only Hive clusters accessible through Qubole. The token gives access to the Qubole API.
# It is available at https://api.qubole.com/users/edit
qubole.api.token=
# hive tables are generally named after the topics. For instance if the topic
# is request_log the hive table is also called request_log. If you want this
# to be pinlog_request_log you can set this config to "pinlog_". This affects
# all topics.
hive.table.prefix=
# You can also name your hive table directly if your hive table doesn't
# follow the pattern of <hive.table.prefix><topic>.
# E.g. hive.table.name.topic1=table1 indicates that the hive table for
# kafka topic topic1 will be named table1.
# Secor can export stats such as consumption lag (in seconds and offsets) per topic partition.
# Leave empty to disable this functionality.
tsdb.hostport=
# Regex of topics that are not exported to TSDB.
monitoring.blacklist.topics=
# Prefix of exported stats.
monitoring.prefix=secor
# Monitoring interval.
# Set to 0 to disable - the progress monitor will run once and exit.
monitoring.interval.seconds=0
# Secor can export stats to statsd such as consumption lag (in seconds and offsets) per topic partition.
# Leave empty to disable this functionality.
statsd.hostport=
# Thrift protocol class. It applies to timestamp extractor below and parquet output for thrift messages.
# TBinaryProtocol by default
secor.thrift.protocol.class=
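# Example (illustrative; only needed if your messages are not TBinaryProtocol):
# secor.thrift.protocol.class=org.apache.thrift.protocol.TCompactProtocol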
# Thrift message class. It applies to parquet output.
# If all Kafka topics transfer the same thrift message type, set secor.thrift.message.class.*=<your message class>
secor.thrift.message.class.*=
# If true, the consumer group will be the initial prefix of all
# exported metrics, before `monitoring.prefix` (if set).
#
# Setting this to false and using monitoring.prefix can lead to nicer metric paths.
# For example,
# secor.kafka.group = secor_hr_partition
# monitoring.prefix = secor.hr
# statsd.prefixWithConsumerGroup = false
# => secor.hr.lag.offsets.<topic>.<partition>
#
# secor.kafka.group = secor_hr_partition
# monitoring.prefix = secor
# statsd.prefixWithConsumerGroup = true
# => secor_hr_partition.secor.lag.offsets.<topic>.<partition>
statsd.prefixWithConsumerGroup=true
# Name of the field that contains the timestamp, for the JSON, MessagePack, or Thrift message parsers. (e.g. 1405970352123)
message.timestamp.name=timestamp
# Separator for defining message.timestamp.name in a nested structure. E.g.
# {"meta_data": {"created": "1405911096123", "last_modified": "1405912096123"}, "data": "test"}
# message.timestamp.name=meta_data.created
# message.timestamp.name.separator=.
message.timestamp.name.separator=
# Field ID of the field that contains timestamp for Thrift message parser.
# N.B. setting this past 1 will come with a performance penalty
message.timestamp.id=1
# Data type of the timestamp field for thrift message parser.
# Supports i64 and i32.
message.timestamp.type=i64
# Name of the field that contains a timestamp in a date format, for JSON. (e.g. 2014-08-07, Jul 23 02:16:57 2005, etc.)
# Should be used when there is no timestamp in a Long format. Time zones are ignored.
message.timestamp.input.pattern=
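# Example (illustrative; assumes a Java date-format pattern matching your data):
# message.timestamp.input.pattern=yyyy-MM-dd HH:mm:ss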
# Whether the timestamp field is required. It should always be required, but
# for historical reasons this check was not enforced, so there might be some
# installations with messages missing the timestamp field.
message.timestamp.required=true
# To enable compression, set this to a valid compression codec implementing
# org.apache.hadoop.io.compress.CompressionCodec interface, such as
# 'org.apache.hadoop.io.compress.GzipCodec'.
secor.compression.codec=
# To set a custom file extension set this to a valid file suffix, such as
# '.gz', '.part', etc.
secor.file.extension=
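# Example (illustrative): gzip-compress output and mark files with a .gz suffix:
# secor.compression.codec=org.apache.hadoop.io.compress.GzipCodec
# secor.file.extension=.gz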
# The secor file reader/writer factory used to read/write the data; by default we write sequence files
secor.file.reader.writer.factory=com.pinterest.secor.io.impl.SequenceFileReaderWriterFactory
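# Example (illustrative; assumes the delimited-text factory shipped with Secor fits
# your message format): write plain delimited text instead of sequence files:
# secor.file.reader.writer.factory=com.pinterest.secor.io.impl.DelimitedTextFileReaderWriterFactory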
# If left blank, defaults to \n
secor.file.reader.Delimiter=\n
# If left blank, no delimiter is added. Do not use \ as it needs to be escaped and
# is an escape character, not a delimiter.
secor.file.writer.Delimiter=\n
# Max message size in bytes to retrieve via KafkaClient. This is used by ProgressMonitor and PartitionFinalizer.
# This should be set large enough to accept the max message size configured in your kafka broker
# Default is 0.1 MB
secor.max.message.size.bytes=100000
# Class that will manage uploads. Default is to use the hadoop
# interface to S3.
secor.upload.manager.class=com.pinterest.secor.uploader.HadoopS3UploadManager
# Set the property below to your timezone, and partitions in s3 will be created according to the timezone provided
secor.parser.timezone=UTC
# Transformer class that transforms and filters messages accordingly.
secor.message.transformer.class=com.pinterest.secor.transformer.IdentityMessageTransformer
# Set the property below to true if you want the md5 hash appended to your s3 path.
# This helps with better partitioning of the data on s3, which gives better performance while reading and writing on s3.
secor.s3.prefix.md5hash=false
# After the given date, secor will upload files to the supplied s3 alternative path
secor.s3.alter.path.date=
# An alternative S3 path for secor to upload files to
secor.s3.alternative.path=
# If enabled, calls to add partitions will be made to qubole during finalization; otherwise, the qubole call is skipped
secor.enable.qubole=true
# Timeout value for qubole calls
secor.qubole.timeout.ms=300000
# Topics to upload at a fixed minute mark
secor.kafka.upload_at_minute_mark.topic_filter=
# The minute mark at which to upload. This isn't triggered unless the topic name matches the filter above.
secor.upload.minute_mark=0
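# Example (illustrative values): upload topics matching "billing_.*" at the
# configured minute mark:
# secor.kafka.upload_at_minute_mark.topic_filter=billing_.*
# secor.upload.minute_mark=5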
# File age per topic and per partition is checked against secor.max.file.age.seconds by looking at
# the youngest file when true or at the oldest file when false. Setting it to true ensures that files
# are uploaded when data stops coming and the size-based policy cannot trigger. Setting it to false
# ensures that files older than secor.max.file.age.seconds are uploaded immediately.
secor.file.age.youngest=true
# Class that manages metric collection.
# Sending metrics to Ostrich is the default implementation.
secor.monitoring.metrics.collector.class=com.pinterest.secor.monitoring.OstrichMetricCollector
# Row group size in bytes for Parquet writers. Specifies how much data will be buffered in memory before flushing a
# block to disk. Larger values allow for larger column chunks, which makes it possible to do larger sequential IO.
# Should be aligned with HDFS blocks. Defaults to 128MB in Parquet 1.9.
parquet.block.size=134217728
# Page group size in bytes for Parquet writers. Indivisible unit for columnar data. Smaller data pages allow for more
# fine grained reading but have higher space overhead. Defaults to 1MB in Parquet 1.9.
parquet.page.size=1048576
# Enable or disable dictionary encoding for Parquet writers. The dictionary encoding builds a dictionary of values
# encountered in a given column. Defaults to true in Parquet 1.9.
parquet.enable.dictionary=true
# Enable or disable validation for Parquet writers. Validates records written against the schema. Defaults to false in
# Parquet 1.9.
parquet.validation=false
# Users can configure an ORC schema for each Kafka topic. A common schema is also possible. This property is mandatory
# if DefaultORCSchemaProvider is used. The ORC schema for all topics should be defined like this:
secor.orc.message.schema.*=struct<a:int\,b:map<string\,string>\,f:array<int>\,g:int>
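# Example (illustrative schema for a hypothetical topic named "test"; assumes per-topic
# keys use the topic name in place of *, and commas in the schema are escaped with \):
# secor.orc.message.schema.test=struct<id:int\,name:string>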
# The config below defines the ORC schema provider class name. Users can plug in a custom ORC schema provider implementation.
secor.orc.schema.provider=com.pinterest.secor.util.orc.schema.DefaultORCSchemaProvider