# comet-spark2_2.11 0.1.11 - reference.conf

root = "/tmp"
root = ${?COMET_ROOT}
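
# Every setting in this file follows the same HOCON pattern: a hard-coded default
# followed by an optional environment override (`key = ${?SOME_ENV_VAR}`). When the
# variable is unset the override line is ignored; when set, the last definition wins.
# Substitutions such as ${root} below are resolved after overrides, so one variable
# relocates everything derived from it, e.g. (illustrative):
#   COMET_ROOT=/data/comet spark-submit ...   # datasets, metadata and tmpdir all move under /data/comet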

datasets = ${root}"/datasets"
datasets = ${?COMET_DATASETS}

metadata = ${root}"/metadata"
metadata = ${?COMET_METADATA}

tmpdir = ${root}"/comet_tmp"
tmpdir = ${?COMET_TMPDIR}

archive = true
archive = ${?COMET_ARCHIVE}

# valid values: airflow | simple; HOCON keeps the last definition, so the effective
# default is "simple" unless COMET_LAUNCHER overrides it
#launcher = airflow
launcher = simple
launcher = ${?COMET_LAUNCHER}

hive = false
hive = ${?COMET_HIVE}

grouped = false
grouped = ${?COMET_GROUPED}

analyze = true
analyze = ${?COMET_ANALYZE}

# When true, additionally save the output as a single CSV file (coalesce(1)) named with a timestamp
timestamped-csv = false
timestamped-csv = ${?COMET_TIMESTAMPED_CSV}

default-write-format = parquet
default-write-format = ${?COMET_DEFAULT_WRITE_FORMAT}
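
# Presumably any Spark DataFrameWriter batch format works here (e.g. "orc", "json",
# "csv"); parquet is simply the shipped default.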

merge-force-distinct = false
merge-force-distinct = ${?COMET_MERGE_FORCE_DISTINCT}

file-system = "file://"
file-system = ${?COMET_FS}

metadata-file-system = ${file-system}
metadata-file-system = ${?COMET_METADATA_FS}

chewer-prefix = "comet.chewer"
chewer-prefix = ${?COMET_CHEWER_PREFIX}

row-validator-class = "com.ebiznext.comet.job.ingest.DsvIngestionUtil"
row-validator-class = ${?COMET_ROW_VALIDATOR_CLASS}

lock {
  path = ${root}"/locks"
  path = ${?COMET_LOCK_PATH}

  ingestion-timeout = -1
  ingestion-timeout = ${?COMET_LOCK_INGESTION_TIMEOUT}

  metrics-timeout = -1
  metrics-timeout = ${?COMET_LOCK_METRICS_TIMEOUT}
}
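
# The unit of these timeouts is not stated here; -1 presumably means wait for the
# lock indefinitely.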

hadoop {
  # empty by default; entries added here are presumably forwarded to the Hadoop Configuration
}

audit {
  path = ${root}"/audit"
  path = ${?COMET_AUDIT_PATH}

  audit-timeout = -1
  audit-timeout = ${?COMET_LOCK_AUDIT_TIMEOUT}

  max-errors = 100
  index {
    type = "None" # can be BigQuery or Jdbc or None
    type = ${?COMET_AUDIT_INDEX_TYPE}

    # BigQuery options
    bq-dataset = "audit"

    # Jdbc options
    jdbc-connection = "audit"
    partitions = 1
    batch-size = 1000
  }

}
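
# For example, to route audit records to the JDBC connection named "audit" in the
# jdbc block below instead of leaving the index type as None:
#   export COMET_AUDIT_INDEX_TYPE=Jdbc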

metrics {
  active = false
  active = ${?COMET_METRICS_ACTIVE}

  #  path = ${root}"/metrics/{domain}/{schema}"
  path = ${root}"/metrics/{domain}"
  path = ${?COMET_METRICS_PATH}

  discrete-max-cardinality = 10
  discrete-max-cardinality = ${?COMET_METRICS_DISCRETE_MAX_CARDINALITY}

  index {
    type = "None" # can be BigQuery or Jdbc or None
    type = ${?COMET_METRICS_INDEX_TYPE}

    # BigQuery options
    bq-dataset = "metrics"

    # Jdbc options
    jdbc-connection = "metrics"
    partitions = 1
    batch-size = 1000
  }
}

jdbc = {
  "audit": {
    uri = "jdbc:postgresql://127.0.0.1:5403/comet?user=postgres&password=ficpug-Podbid-7fobnu",
    user = "postgres",
    password = "ficpug-Podbid-7fobnu",
    driver = "org.postgresql.Driver"
  }
}
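
# Further named connections can be declared alongside "audit"; HOCON merges repeated
# object keys, and the names are what jdbc-connection in the audit/metrics index blocks
# above refers to. A hypothetical "metrics" connection (illustrative values only):
#   jdbc = {
#     "metrics": {
#       uri = "jdbc:postgresql://127.0.0.1:5432/comet",
#       user = "postgres",
#       password = "changeme",
#       driver = "org.postgresql.Driver"
#     }
#   }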

jdbc-engines {
  h2 {
    driver = "org.h2.Driver"
    tables = {
      "audit": {
        name = "AUDIT"
        create-sql = """CREATE TABLE IF NOT EXISTS AUDIT (
                          jobid VARCHAR(255) NOT NULL,
                          paths VARCHAR(255) NOT NULL,
                          domain VARCHAR(255) NOT NULL,
                          schema VARCHAR(255) NOT NULL,
                          success BOOLEAN NOT NULL,
                          count INTEGER NOT NULL,
                          countAccepted INTEGER NOT NULL,
                          countRejected INTEGER NOT NULL,
                          timestamp TIMESTAMP NOT NULL,
                          duration INTEGER NOT NULL,
                          message VARCHAR(255) NOT NULL
                        )"""
      },
      "rejected": {
        name = "REJECTED"
        create-sql = """CREATE TABLE IF NOT EXISTS REJECTED (
                          jobid VARCHAR(255) NOT NULL,
                          timestamp TIMESTAMP NOT NULL,
                          domain VARCHAR(255) NOT NULL,
                          schema VARCHAR(255) NOT NULL,
                          error VARCHAR(255) NOT NULL,
                          path VARCHAR(255) NOT NULL
                        )"""
      },
      "metrics": {
        name = "METRICS"
        create-sql = """CREATE TABLE IF NOT EXISTS METRICS (
                          domain VARCHAR(255) NOT NULL,
                          schema VARCHAR(255) NOT NULL,
                          attribute VARCHAR(255) NOT NULL,
                          min INTEGER NULL,
                          max INTEGER NULL,
                          mean DOUBLE PRECISION NULL,
                          missingValues INTEGER NULL,
                          standardDev DOUBLE PRECISION NULL,
                          variance DOUBLE PRECISION NULL,
                          sum INTEGER NULL,
                          skewness DOUBLE PRECISION NULL,
                          kurtosis INTEGER NULL,
                          percentile25 INTEGER NULL,
                          median INTEGER NULL,
                          percentile75 INTEGER NULL,
                          catCountFreq TEXT NOT NULL,
                          countDistinct INTEGER NULL,
                          missingValuesDiscrete INTEGER NULL,
                          count INTEGER NOT NULL,
                          cometTime INTEGER NOT NULL,
                          cometStage VARCHAR(255) NOT NULL
                        )"""
      }
    }
  }

  postgresql {
    driver = "org.postgresql.Driver"
    tables = {
      "audit": {
        name = "audit"
        create-sql = """CREATE TABLE IF NOT EXISTS audit (
                          jobid VARCHAR(255) NOT NULL,
                          paths VARCHAR(255) NOT NULL,
                          domain VARCHAR(255) NOT NULL,
                          schema VARCHAR(255) NOT NULL,
                          success BOOLEAN NOT NULL,
                          count INTEGER NOT NULL,
                          countAccepted INTEGER NOT NULL,
                          countRejected INTEGER NOT NULL,
                          timestamp TIMESTAMP NOT NULL,
                          duration INTEGER NOT NULL,
                          message VARCHAR(255) NOT NULL
                        )"""
      },
      "rejected": {
        name = "rejected"
        create-sql = """CREATE TABLE IF NOT EXISTS rejected (
                          jobid VARCHAR(255) NOT NULL,
                          timestamp TIMESTAMP NOT NULL,
                          domain VARCHAR(255) NOT NULL,
                          schema VARCHAR(255) NOT NULL,
                          error VARCHAR(255) NOT NULL,
                          path VARCHAR(255) NOT NULL
                        )"""
      },
      "metrics": {
        name = "metrics"
        create-sql = """CREATE TABLE IF NOT EXISTS metrics (
                          domain VARCHAR(255) NOT NULL,
                          schema VARCHAR(255) NOT NULL,
                          attribute VARCHAR(255) NOT NULL,
                          min INTEGER NULL,
                          max INTEGER NULL,
                          mean DOUBLE PRECISION NULL,
                          missingValues INTEGER NULL,
                          standardDev DOUBLE PRECISION NULL,
                          variance DOUBLE PRECISION NULL,
                          sum INTEGER NULL,
                          skewness DOUBLE PRECISION NULL,
                          kurtosis INTEGER NULL,
                          percentile25 INTEGER NULL,
                          median INTEGER NULL,
                          percentile75 INTEGER NULL,
                          catCountFreq TEXT NOT NULL,
                          countDistinct INTEGER NULL,
                          missingValuesDiscrete INTEGER NULL,
                          count INTEGER NOT NULL,
                          cometTime INTEGER NOT NULL,
                          cometStage VARCHAR(255) NOT NULL
                        )"""
        // NOTE ON POSTGRES:
        // The intent is that the categorical data (category, countByCategory, frequencies,
        // aggregated above into catCountFreq) is actually stored as a jsonb-typed field.
        // A suggested implementation is to make metrics not a table but a writable view
        // over a back-end table where that field is jsonb rather than text, with the
        // conversions handled by the PostgreSQL engine (the Spark SQL layer cannot be
        // counted on to know about JSON, as of Spark 2.4.5).
      }
    }
  }
}

area {
  pending = "pending"
  pending = ${?COMET_AREA_PENDING}
  unresolved = "unresolved"
  unresolved = ${?COMET_AREA_UNRESOLVED}
  archive = "archive"
  archive = ${?COMET_AREA_ARCHIVE}
  ingesting = "ingesting"
  ingesting = ${?COMET_AREA_INGESTING}
  accepted = "accepted"
  accepted = ${?COMET_AREA_ACCEPTED}
  rejected = "rejected"
  rejected = ${?COMET_AREA_REJECTED}
  business = "business"
  business = ${?COMET_AREA_BUSINESS}
}
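
# These are the per-domain subfolder names under ${datasets} that model the ingestion
# lifecycle: incoming files presumably land in "pending", move to "ingesting" while
# processed, end up in "accepted" or "rejected", and are kept in "archive" when
# archive = true above.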

privacy {
  options = {
    "none": "com.ebiznext.comet.privacy.No",
    "hide": "com.ebiznext.comet.privacy.Hide",
    "hide10X": "com.ebiznext.comet.privacy.Hide(\"X\",10)",
    "approxLong20": "com.ebiznext.comet.privacy.ApproxLong(20)",
    "md5": "com.ebiznext.comet.privacy.Md5",
    "sha1": "com.ebiznext.comet.privacy.Sha1",
    "sha256": "com.ebiznext.comet.privacy.Sha256",
    "sha512": "com.ebiznext.comet.privacy.Sha512",
    "initials": "com.ebiznext.comet.privacy.Initials"
  }
}
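
# Schema attributes reference these keys by name to mask values at ingestion time,
# e.g. (illustrative YAML attribute, assuming the comet schema format):
#   - name: "email"
#     type: "string"
#     privacy: "sha256"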


elasticsearch {
  active = false
  active = ${?COMET_ES_ACTIVE}
  options = {
    "es.nodes": "localhost",
    "es.port": "9200",

    #  "es.net.http.auth.user": "",
    #  "es.net.http.auth.pass": "",

    "es.net.ssl": "false",
    "es.net.ssl.cert.allow.self.signed": "false",

    "es.batch.size.entries": "1000",
    "es.batch.size.bytes": "1mb",
    "es.batch.write.retry.count": "3",
    "es.batch.write.retry.wait": "10s"
  }
}
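
# The options map is handed through to the elasticsearch-hadoop connector (the es.*
# keys above are standard connector settings). With these defaults, indexing against
# a local cluster should only require COMET_ES_ACTIVE=true.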

atlas {
  uri = "http://127.0.0.1:21000"
  uri = ${?COMET_ATLAS_URI}
  user = "admin"
  user = ${?COMET_ATLAS_USER}
  password = "admin"
  password = ${?COMET_ATLAS_PASSWORD}
  owner = "system"
  owner = ${?COMET_ATLAS_OWNER}
}

spark {
  #  sql.hive.convertMetastoreParquet = false
  #extraListeners = com.hortonworks.spark.atlas.SparkAtlasEventTracker
  #sql.queryExecutionListeners = com.hortonworks.spark.atlas.SparkAtlasEventTracker
  #sql.streaming.streamingQueryListeners=com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker
  #  yarn.principal = "invalid"
  #  yarn.keytab = "invalid"
  #  yarn.principal = ${?SPARK_YARN_PRINCIPAL}
  #  yarn.keytab = ${?SPARK_YARN_KEYTAB}
  debug.maxToStringFields = 100
  #master = "local[*]"
  #  sql.catalogImplementation="hive"
  #  sql.catalogImplementation="in-memory"
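  # CORRECTED writes dates/timestamps as-is in the Proleptic Gregorian calendar rather
  # than rebasing them to the legacy hybrid calendar; the key targets Spark 3's
  # spark.sql.legacy.parquet.datetimeRebaseModeInWrite and should have no effect on Spark 2.x.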
  sql.legacy.parquet.datetimeRebaseModeInWrite = "CORRECTED"
}


# curl -v -H 'Cache-Control: no-cache'  -H 'Content-Type: application/json'  -XPOST localhost:8080/api/experimental/dags/comet_validator/dag_runs -d '{"conf":"{\"key\":\"value\"}"}'

airflow {
  ingest = "comet_ingest"
  ingest = ${?AIRFLOW_INGEST}
  endpoint = "http://127.0.0.1:8080/api/experimental"
  endpoint = ${?AIRFLOW_ENDPOINT}
}
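
# Following the curl sample above, the ingest DAG named here would be triggered with
# something like (illustrative):
#   curl -H 'Content-Type: application/json' \
#     -XPOST http://127.0.0.1:8080/api/experimental/dags/comet_ingest/dag_runs \
#     -d '{"conf":"{\"key\":\"value\"}"}'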


internal {
  # See https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/storage/StorageLevel.html
  cache-storage-level = "MEMORY_AND_DISK"
  cache-storage-level = ${?COMET_INTERNAL_CACHE_STORAGE_LEVEL}
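  # Other Spark StorageLevel names include MEMORY_ONLY, MEMORY_ONLY_SER,
  # MEMORY_AND_DISK_SER, DISK_ONLY and NONE.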
}



