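# reference.conf -- default configuration shipped with comet-spark3_2.12 version 0.1.20
#
# Most settings below follow the same pattern: a hard-coded default immediately
# followed by an optional override through HOCON's ${?VAR} substitution. If the
# named environment variable (or config path) is defined, it wins; otherwise the
# default is kept. For example, launching with the (illustrative) value
#   COMET_ROOT=/data/comet
# in the environment replaces the default root of "/tmp" below.
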
root = "/tmp"
root = ${?COMET_ROOT}

datasets = ${root}"/datasets"
datasets = ${?COMET_DATASETS}

metadata = ${root}"/metadata"
metadata = ${?COMET_METADATA}

tmpdir = ${root}"/comet_tmp"
tmpdir = ${?COMET_TMPDIR}

archive = true
archive = ${?COMET_ARCHIVE}

# Job launcher: "airflow" or "simple" (the last assignment wins in HOCON, so "simple" is the effective default)
launcher = airflow
launcher = simple
launcher = ${?COMET_LAUNCHER}

hive = false
hive = ${?COMET_HIVE}

grouped = false
grouped = ${?COMET_GROUPED}

analyze = true
analyze = ${?COMET_ANALYZE}

# When true, save the output in CSV format using coalesce(1) (single output file)
csv-output = false
csv-output = ${?COMET_CSV_OUTPUT}

privacy-only = false
privacy-only = ${?COMET_PRIVACY_ONLY}

default-write-format = parquet
default-write-format = ${?COMET_DEFAULT_WRITE_FORMAT}

merge-force-distinct = false
merge-force-distinct = ${?COMET_MERGE_FORCE_DISTINCT}

file-system = "file://"
file-system = ${?COMET_FS}

udfs = ${?COMET_UDFS}

metadata-file-system = ${file-system}
metadata-file-system = ${?COMET_METADATA_FS}

chewer-prefix = "comet.chewer"
chewer-prefix = ${?COMET_CHEWER_PREFIX}

row-validator-class = "com.ebiznext.comet.job.ingest.DsvIngestionUtil"
row-validator-class = ${?COMET_ROW_VALIDATOR_CLASS}

lock {
  path = ${root}"/locks"
  path = ${?COMET_LOCK_PATH}

  ingestion-timeout = -1
  ingestion-timeout = ${?COMET_LOCK_INGESTION_TIMEOUT}

  metrics-timeout = -1
  metrics-timeout = ${?COMET_LOCK_METRICS_TIMEOUT}
}

hadoop {
}

audit {
  path = ${root}"/audit"
  path = ${?COMET_AUDIT_PATH}

  audit-timeout = -1
  audit-timeout = ${?COMET_LOCK_AUDIT_TIMEOUT}

  max-errors = 100
  sink {
    type = "NoneSink" # can be BigQuerySink or JdbcSink or NoneSink or EsSink
    type = ${?COMET_AUDIT_SINK_TYPE}
    name = "audit" // serves as dataset name for BigQuery or Elasticsearch index name

    ## BigQuery options
    # location = "EU"
    # timestamp = "_PARTITIONTIME"
    # clustering = "???"
    # days = 7
    # require-partition-filter = false


    # Jdbc options
    connection = "audit"
    partitions = 1
    batchSize = 1000
  }

}
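# Illustrative only: to persist audit records instead of discarding them, the sink
# type above can be overridden at launch time, e.g.
#   COMET_AUDIT_SINK_TYPE=JdbcSink
# in which case the sink's "connection" key ("audit") is expected to match an
# entry in the "connections" block further down.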

metrics {
  active = false
  active = ${?COMET_METRICS_ACTIVE}

  #  path = ${root}"/metrics/{domain}/{schema}"
  path = ${root}"/metrics/{domain}"
  path = ${?COMET_METRICS_PATH}

  discrete-max-cardinality = 10
  discrete-max-cardinality = ${?COMET_METRICS_DISCRETE_MAX_CARDINALITY}
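  # Presumably the cardinality cut-off used by profiling: attributes with at most
  # this many distinct values are treated as discrete, the others as continuous.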

  sink {
    type = "NoneSink" # can be BigQuerySink or JdbcSink or NoneSink or EsSink
    type = ${?COMET_METRICS_SINK_TYPE}
    name = "metrics" // serves as dataset name for BigQuery or Elasticsearch index name

    ## BigQuery options
    # location = "EU"
    # timestamp = "_PARTITIONTIME"
    # clustering = "???"
    # days = 7
    # require-partition-filter = false

    # Jdbc options
    connection = "metrics"
    partitions = 1
    batch-size = 1000
  }
}

connections = {
  "audit": {
    "format" = "jdbc"
#    "mode" = "Append"
    options = {
        "url": "jdbc:postgresql://127.0.0.1:5403/comet?user=postgres&password=ficpug-Podbid-7fobnu",
        "user": "postgres",
        "password": "ficpug-Podbid-7fobnu",
        "driver": "org.postgresql.Driver"
        # driver = "org.h2.Driver"
    }
  }
}
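# Illustrative only: the metrics sink above references connection = "metrics",
# which has no entry here by default. If the metrics sink type is switched to
# JdbcSink, a matching connection would be declared like "audit" above, e.g.
# (hypothetical values):
#
# connections.metrics {
#   format = "jdbc"
#   options = {
#     "url": "jdbc:postgresql://127.0.0.1:5403/comet",
#     "user": "postgres",
#     "password": "...",
#     "driver": "org.postgresql.Driver"
#   }
# }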

jdbc-engines {
  h2 {
    tables = {
      "audit": {
        create-sql = """CREATE TABLE IF NOT EXISTS AUDIT (
                              jobid VARCHAR(255) not NULL,
                              paths VARCHAR(255) not NULL,
                              domain VARCHAR(255) not NULL,
                              schema VARCHAR(255) not NULL,
                              success BOOLEAN not NULL,
                              count BIGINT not NULL,
                              countAccepted BIGINT not NULL,
                              countRejected BIGINT not NULL,
                              timestamp TIMESTAMP not NULL,
                              duration INTEGER not NULL,
                              message VARCHAR(255) not NULL
                             )
    """
      },
      "rejected": {
        create-sql = """CREATE TABLE IF NOT EXISTS REJECTED (
                              jobid VARCHAR(255) not NULL,
                              timestamp TIMESTAMP not NULL,
                              domain VARCHAR(255) not NULL,
                              schema VARCHAR(255) not NULL,
                              error VARCHAR(255) not NULL,
                              path VARCHAR(255) not NULL
                             )
    """
      },
      "frequencies": {
        create-sql = """CREATE TABLE IF NOT EXISTS frequencies (
            attribute VARCHAR(255) not NULL,
            category TEXT NULL,
            count BIGINT not NULL,
            frequency DOUBLE PRECISION NULL,
            jobId VARCHAR(255) not NULL,
            domain VARCHAR(255) not NULL,
            schema VARCHAR(255) not NULL,
            cometTime BIGINT not NULL,
            cometStage VARCHAR(255) not NULL
        )
        """
      },
      "continuous": {
        create-sql = """CREATE TABLE IF NOT EXISTS continuous (
            attribute VARCHAR(255) not NULL,
            min DOUBLE PRECISION NULL,
            max DOUBLE PRECISION NULL,
            mean DOUBLE PRECISION NULL,
            missingValues BIGINT NULL,
            variance DOUBLE PRECISION NULL,
            standardDev DOUBLE PRECISION NULL,
            sum DOUBLE PRECISION NULL,
            skewness DOUBLE PRECISION NULL,
            kurtosis DOUBLE PRECISION NULL,
            percentile25 DOUBLE PRECISION NULL,
            median DOUBLE PRECISION NULL,
            percentile75 DOUBLE PRECISION NULL,
            cometMetric VARCHAR(255) not NULL,
            jobId VARCHAR(255) not NULL,
            domain VARCHAR(255) not NULL,
            schema VARCHAR(255) not NULL,
            count BIGINT not NULL,
            cometTime BIGINT not NULL,
            cometStage VARCHAR(255) not NULL
        )
        """
      },
      "discrete": {
        create-sql = """CREATE TABLE IF NOT EXISTS discrete (
            attribute VARCHAR(255) not NULL,
            countDistinct BIGINT NULL,
            missingValuesDiscrete BIGINT NULL,
            cometMetric VARCHAR(255) not NULL,
            jobId VARCHAR(255) not NULL,
            domain VARCHAR(255) not NULL,
            schema VARCHAR(255) not NULL,
            count BIGINT not NULL,
            cometTime BIGINT not NULL,
            cometStage VARCHAR(255) not NULL
        )
        """
      }
    }
  }

  postgresql {
    tables = {
      "audit": {
        create-sql = """CREATE TABLE IF NOT EXISTS audit (
                              jobid VARCHAR(255) not NULL,
                              paths VARCHAR(255) not NULL,
                              domain VARCHAR(255) not NULL,
                              schema VARCHAR(255) not NULL,
                              success BOOLEAN not NULL,
                              count BIGINT not NULL,
                              countAccepted BIGINT not NULL,
                              countRejected BIGINT not NULL,
                              timestamp TIMESTAMP not NULL,
                              duration INTEGER not NULL,
                              message VARCHAR(255) not NULL
                             )
    """
      },
      "rejected": {
        create-sql = """CREATE TABLE IF NOT EXISTS rejected (
                              jobid VARCHAR(255) not NULL,
                              timestamp TIMESTAMP not NULL,
                              domain VARCHAR(255) not NULL,
                              schema VARCHAR(255) not NULL,
                              error VARCHAR(255) not NULL,
                              path VARCHAR(255) not NULL
                             )
    """
      },
      "frequencies": {
        create-sql = """CREATE TABLE IF NOT EXISTS frequencies (
            attribute VARCHAR(255) not NULL,
            category TEXT NULL,
            count BIGINT not NULL,
            frequency DOUBLE PRECISION NULL,
            jobId VARCHAR(255) not NULL,
            domain VARCHAR(255) not NULL,
            schema VARCHAR(255) not NULL,
            cometTime BIGINT not NULL,
            cometStage VARCHAR(255) not NULL
        )
        """
      },
      "continuous": {
        create-sql = """CREATE TABLE IF NOT EXISTS continuous (
            attribute VARCHAR(255) not NULL,
            min DOUBLE PRECISION NULL,
            max DOUBLE PRECISION NULL,
            mean DOUBLE PRECISION NULL,
            missingValues BIGINT NULL,
            variance DOUBLE PRECISION NULL,
            standardDev DOUBLE PRECISION NULL,
            sum DOUBLE PRECISION NULL,
            skewness DOUBLE PRECISION NULL,
            kurtosis DOUBLE PRECISION NULL,
            percentile25 DOUBLE PRECISION NULL,
            median DOUBLE PRECISION NULL,
            percentile75 DOUBLE PRECISION NULL,
            cometMetric VARCHAR(255) not NULL,
            jobId VARCHAR(255) not NULL,
            domain VARCHAR(255) not NULL,
            schema VARCHAR(255) not NULL,
            count BIGINT not NULL,
            cometTime BIGINT not NULL,
            cometStage VARCHAR(255) not NULL
        )
        """
      },
      "discrete": {
        name = "discrete"
        create-sql = """CREATE TABLE IF NOT EXISTS discrete (
            attribute VARCHAR(255) not NULL,
            countDistinct BIGINT NULL,
            missingValuesDiscrete BIGINT NULL,
            cometMetric VARCHAR(255) not NULL,
            jobId VARCHAR(255) not NULL,
            domain VARCHAR(255) not NULL,
            schema VARCHAR(255) not NULL,
            count BIGINT not NULL,
            cometTime BIGINT not NULL,
            cometStage VARCHAR(255) not NULL
        )
        """
      }
    }
  }
}
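# Each engine above ("h2", "postgresql") maps the logical tables Comet writes to
# (audit, rejected, and the metrics tables frequencies / continuous / discrete)
# to the DDL issued to create them when they do not yet exist
# (CREATE TABLE IF NOT EXISTS). The engine is presumably selected according to
# the JDBC connection being used.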

area {
  pending = "pending"
  pending = ${?COMET_AREA_PENDING}
  unresolved = "unresolved"
  unresolved = ${?COMET_AREA_UNRESOLVED}
  archive = "archive"
  archive = ${?COMET_AREA_ARCHIVE}
  ingesting = "ingesting"
  ingesting = ${?COMET_AREA_INGESTING}
  accepted = "accepted"
  accepted = ${?COMET_AREA_ACCEPTED}
  rejected = "rejected"
  rejected = ${?COMET_AREA_REJECTED}
  business = "business"
  business = ${?COMET_AREA_BUSINESS}
}
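# These names presumably define the lifecycle sub-directories created under the
# "datasets" root, e.g. (illustrative) /tmp/datasets/pending/<domain> for incoming
# files and /tmp/datasets/accepted/<domain> for validated data.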

privacy {
  options = {
    "none": "com.ebiznext.comet.privacy.No",
    "hide": "com.ebiznext.comet.privacy.Hide",
    "hide10X": "com.ebiznext.comet.privacy.Hide(\"X\",10)",
    "approxLong20": "com.ebiznext.comet.privacy.ApproxLong(20)",
    "md5": "com.ebiznext.comet.privacy.Md5",
    "sha1": "com.ebiznext.comet.privacy.Sha1",
    "sha256": "com.ebiznext.comet.privacy.Sha256",
    "sha512": "com.ebiznext.comet.privacy.Sha512",
    "initials": "com.ebiznext.comet.privacy.Initials"
  }
}
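# Illustrative only: additional strategies can be registered by mapping a name to
# a fully qualified class, optionally with constructor arguments as shown above
# for "hide10X" and "approxLong20". A hypothetical custom entry would look like:
#   "maskEmail": "com.mycompany.privacy.MaskEmail"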


elasticsearch {
  active = false
  active = ${?COMET_ES_ACTIVE}
  options = {
    "es.nodes": "localhost",
    "es.port": "9200",

    # "es.net.http.auth.user": "",
    # "es.net.http.auth.pass": "",

    "es.net.ssl": "false",
    "es.net.ssl.cert.allow.self.signed": "false",

    "es.batch.size.entries": "1000",
    "es.batch.size.bytes": "1mb",
    "es.batch.write.retry.count": "3",
    "es.batch.write.retry.wait": "10s"
  }
}
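# The "es.*" keys are elasticsearch-hadoop connector settings and are presumably
# passed through to the connector as-is when indexing into Elasticsearch. Basic
# authentication can be enabled by uncommenting es.net.http.auth.user /
# es.net.http.auth.pass above and providing real credentials.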

atlas {
  uri = "http://127.0.0.1:21000"
  uri = ${?COMET_ATLAS_URI}
  user = "admin"
  user = ${?COMET_ATLAS_USER}
  password = "admin"
  password = ${?COMET_ATLAS_PASSWORD}
  owner = "system"
  owner = ${?COMET_ATLAS_OWNER}
}

spark {
  #  sql.hive.convertMetastoreParquet = false
  #extraListeners = com.hortonworks.spark.atlas.SparkAtlasEventTracker
  #sql.queryExecutionListeners = com.hortonworks.spark.atlas.SparkAtlasEventTracker
  #sql.streaming.streamingQueryListeners=com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker
  #  yarn.principal = "invalid"
  #  yarn.keytab = "invalid"
  #  yarn.principal = ${?SPARK_YARN_PRINCIPAL}
  #  yarn.keytab = ${?SPARK_YARN_KEYTAB}
  debug.maxToStringFields = 100
  #master = "local[*]"
  #  sql.catalogImplementation="hive"
  #  sql.catalogImplementation="in-memory"
  sql.legacy.parquet.datetimeRebaseModeInWrite = "CORRECTED"
  viewsEnabled = "true"
}
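# Keys in this block are presumably prefixed with "spark." and applied to the
# Spark session at startup (e.g. debug.maxToStringFields would become
# spark.debug.maxToStringFields), so further Spark properties can be added here.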


# curl -v -H 'Cache-Control: no-cache'  -H 'Content-Type: application/json'  -XPOST localhost:8080/api/experimental/dags/comet_validator/dag_runs -d '{"conf":"{\"key\":\"value\"}"}'

airflow {
  ingest = "comet_ingest"
  ingest = ${?AIRFLOW_INGEST}
  endpoint = "http://127.0.0.1:8080/api/experimental"
  endpoint = ${?AIRFLOW_ENDPOINT}
}
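# Used when launcher = "airflow": "endpoint" is the base URL of Airflow's
# experimental REST API (see the curl example above) and "ingest" is presumably
# the id of the DAG triggered for ingestion runs.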


internal {
  # See https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/storage/StorageLevel.html
  cache-storage-level = "MEMORY_AND_DISK"
  cache-storage-level = ${?COMET_INTERNAL_CACHE_STORAGE_LEVEL}
}
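# Other valid Spark storage levels include MEMORY_ONLY, MEMORY_ONLY_SER,
# MEMORY_AND_DISK_SER, DISK_ONLY and their *_2 replicated variants, e.g.
# (illustrative) COMET_INTERNAL_CACHE_STORAGE_LEVEL=MEMORY_ONLY.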



