
starlake-spark3_2.12.1.3.0.source-code.starlake.json Maven / Gradle / Ivy
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://json.schemastore.org/starlake.json",
"title": "Starlake Data Pipeline",
"description": "JSON Schema for Starlake Data Pipeline",
"type": "object",
"properties": {
"version": {
"type": "integer",
"enum": [1]
}
},
"allOf": [
{
"$ref": "#/definitions/StarlakeV1Base"
}
],
"required": ["version"],
"definitions": {
"ConvertibleToString": {
"anyOf": [
{ "type": "string" },
{ "type": "boolean" },
{ "type": "number" },
{ "type": "integer" },
{ "type": "null" }
]
},
"MergeOnV1": {
"oneOf": [
{
"const": "TARGET",
"description": "TODO"
},
{
"const": "SOURCE_AND_TARGET",
"description": "TODO"
}
]
},
"PrimitiveTypeV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Define the value type",
"oneOf": [
{
"const": "string",
"description": "Any string that match the '.*' regex"
},
{
"const": "long",
"description": "Any whole number that matches the '[-|+|0-9][0-9]*' regex.\nints are mapped as 'int' in some databases whereas longs are mapped as 'bigint' and shorts as 'smallint'"
},
{
"const": "int",
"description": "Any whole number that matches the '[-|+|0-9][0-9]*' regex.\nints are mapped as 'int' in some databases whereas longs are mapped as 'bigint' and shorts as 'smallint'"
},
{
"const": "short",
"description": "Any whole number that matches the '[-|+|0-9][0-9]*' regex.\nints are mapped as 'int' in some databases whereas longs are mapped as 'bigint' and shorts as 'smallint'"
},
{
"const": "double",
"description": "Any decimal number that match the '[-+]?\\d*\\.?\\d+[Ee]?[-+]?\\d*' regex"
},
{
"const": "boolean",
"description": "Any string that match the '(?i)true|yes|[y1]<-TF->(?i)false|no|[n0]' regex,\nwhere the value on the left of '<-T' represent true and values on the right of 'F->' represent the false"
},
{
"const": "byte",
"description": "Any single char"
},
{
"const": "date",
"description": "Any date that match the 'yyyy-MM-dd' regex (2023-12-31)"
},
{
"const": "timestamp",
"description": "Any date/time that matches the 'yyyy-MM-dd HH:mm:ss' regex (2019-12-31 23:59:02).\nFor epoch timestamp, set pattern attribute to 'epoch_second' or 'epoch_milli'"
},
{
"const": "decimal",
"description": "Any floating value that match the '-?\\d*\\.{0,1}\\d+' regex"
},
{
"const": "variant",
"description": "Semi structured data type eq. JSON / XML"
},
{
"const": "struct",
"description": "Any attribute that has children. Set the array to true if this attribute is made of a list of attributes"
}
]
},
"TrimV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "How to trim the input string",
"oneOf": [
{
"const": "LEFT",
"description": "Remove all leading space chars from the input"
},
{
"const": "RIGHT",
"description": "Remove all trailing spaces from the input"
},
{
"const": "BOTH",
"description": "Remove all leading and trailing spaces from the input"
},
{
"const": "NONE",
"description": "Do not remove leading or trailing spaces from the input"
}
]
},
"TableDdlV1": {
"description": "DDL used to create a table",
"type": "object",
"properties": {
"createSql": {
"$ref": "#/definitions/ConvertibleToString",
"description": "SQL CREATE DDL statement"
},
"pingSql": {
"$ref": "#/definitions/ConvertibleToString",
"description": "How to test if the table exists.\nUse the following statement by default: 'select count(*) from tableName where 1=0'"
},
"selectSql": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Override the default select defined by Starlake"
}
},
"required": ["createSql"]
},
"Materialization": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Table types supported by the Sink option",
"oneOf": [
{
"const": "TABLE",
"description": "SQL Table"
},
{
"const": "VIEW",
"description": "SQL View"
},
{
"const": "MATERIALIZED_VIEW",
"description": "SQL Materialized View"
}
]
},
"TableTypeBase": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Table types supported by the Extract module",
"oneOf": [
{
"const": "TABLE",
"description": "SQL Table"
},
{
"const": "VIEW",
"description": "SQL View"
},
{
"const": "SYSTEM TABLE",
"description": "Database specific system table"
},
{
"const": "MATERIALIZED VIEW",
"description": "SQL Materialized View"
},
{
"const": "GLOBAL TEMPORARY",
"description": ""
},
{
"const": "LOCAL TEMPORARY",
"description": ""
},
{
"const": "ALIAS",
"description": "Table alias"
},
{
"const": "SYNONYM",
"description": "Table synonym"
}
]
},
"TableTypeV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Table types supported by the Extract module",
"oneOf": [
{
"$ref": "#/definitions/TableTypeBase"
},
{
"$ref": "#/definitions/ConvertibleToString",
"description": "Any valid table type",
"not": {
"$ref": "#/definitions/TableTypeBase"
}
}
]
},
"TypeV1": {
"type": "object",
"description": "Custom type definition. Custom types are defined in the types/types.sl.yml file",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "unique id for this type"
},
"pattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Regex used to validate the input field"
},
"primitiveType": {
"$ref": "#/definitions/PrimitiveTypeV1",
"description": "To what primitive type should this type be mapped.\n This is the memory representation of the type, When saving, this primitive type is mapped to the database specific type. Default: string"
},
"zone": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Useful when parsing specific strings:\n- double: To parse a French decimal (comma as decimal separator) set it to the fr_FR locale.\n- decimal: to set the precision and scale of this number, '38,9' by default."
},
"sample": {
"$ref": "#/definitions/ConvertibleToString",
"description": "This field makes sure that the pattern matches the value you want to match. This will be checked on startup"
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Describes this type"
},
"ddlMapping": {
"$ref": "#/definitions/MapString",
"description": "Configure here the type mapping for each datawarehouse.\nWill be used when inferring DDL from schema."
}
},
"required": ["name", "pattern"]
},
"PositionV1": {
"description": "First and last char positions of an attribute in a fixed length record",
"type": "object",
"properties": {
"first": {
"type": "number",
"description": "Zero based position of the first character for this attribute"
},
"last": {
"type": "number",
"description": "Zero based position of the last character to include in this attribute"
}
},
"required": ["first", "last"]
},
"ConnectionV1": {
"description": "Connection properties to a datawarehouse.",
"type": "object",
"properties": {
"type": {
"$ref": "#/definitions/ConvertibleToString",
"description": "aka jdbc, bigquery, snowflake, redshift ..."
},
"sparkFormat": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Set only if you want to use the Spark engine"
},
"quote": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Set only if you want to use the Spark engine"
},
"separator": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Set only if you want to use the Spark engine"
},
"options": {
"$ref": "#/definitions/MapString",
"description": "Connection options"
}
},
"required": ["type"]
},
"DagGenerationConfigV1": {
"description": "Dag configuration.",
"type": "object",
"properties": {
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Dag config description"
},
"template": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Dag template to use for this config. Usually a .py.j2 file"
},
"filename": {
"$ref": "#/definitions/ConvertibleToString",
"description": "{schedule}, {domain}, {table} in the file name are used for DAG generation purposes"
},
"options": {
"$ref": "#/definitions/MapString",
"description": "DAG generation options"
}
},
"required": ["template", "filename"]
},
"RowLevelSecurityV1": {
"description": "Row level security policy to apply to the output data.",
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "This Row Level Security unique name"
},
"predicate": {
"$ref": "#/definitions/ConvertibleToString",
"description": "The condition that goes to the WHERE clause and limit the visible rows."
},
"grants": {
"description": "user / groups / service accounts to which this security level is applied.\nex : user:[email protected],group:[email protected],serviceAccount:[email protected]",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"description": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Description for this access policy"
}
},
"required": ["name", "grants"]
},
"AccessControlEntryV1": {
"description": "Column level security policy to apply to the attribute.",
"type": "object",
"properties": {
"role": {
"$ref": "#/definitions/ConvertibleToString",
"description": "This role to give to the granted users"
},
"grants": {
"description": "user / groups / service accounts to which this security level is applied.\nex : user:[email protected],group:[email protected],serviceAccount:[email protected]",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
}
},
"required": ["role", "grants"]
},
"FormatV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "DSV by default. Supported file formats are :\n- DSV : Delimiter-separated values file. Delimiter value is specified in the \"separator\" field.\n- POSITION : FIXED format file where values are located at an exact position in each line.\n- JSON_FLAT : For optimisation purpose, we differentiate JSON with top level values from JSON\n with deep level fields. JSON_FLAT are JSON files with top level fields only.\n- JSON : Deep JSON file. Use only when your json documents contain sub-documents, otherwise prefer to\n use JSON_FLAT since it is much faster.\n- XML : XML files",
"oneOf": [
{
"const": "DSV",
"description": "any single or multiple character delimited file. Separator is specified in the separator field"
},
{
"const": "POSITION",
"description": "any fixed position file. Positions are specified in the position field"
},
{
"const": "JSON",
"description": "any deep json file.\nTo improve performance, prefer the JSON_FLAT format if your json documents are flat"
},
{
"const": "JSON_ARRAY",
"description": "any json file containing an array of json objects."
},
{
"const": "JSON_FLAT",
"description": "any flat json file.\nTo improve performance, prefer this format if your json documents are flat"
},
{
"const": "XML",
"description": "any xml file. Use the metadata.xml.rowTag field to specify the root tag of your xml file"
},
{
"const": "TEXT_XML",
"description": "TODO"
},
{
"const": "KAFKA",
"description": "TODO"
},
{
"const": "KAFKASTREAM",
"description": "TODO"
},
{
"const": "GENERIC",
"description": "TODO"
},
{
"const": "PARQUET",
"description": "TODO"
}
]
},
"MapString": {
"type": "object",
"description": "Map of string",
"additionalProperties": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"MapConnectionV1": {
"type": "object",
"description": "Map of jdbc engines",
"additionalProperties": {
"$ref": "#/definitions/ConnectionV1"
}
},
"MapJdbcEngineV1": {
"type": "object",
"description": "Map of jdbc engines",
"additionalProperties": {
"$ref": "#/definitions/JdbcEngineV1"
}
},
"MapTableDdlV1": {
"type": "object",
"description": "Map of table ddl",
"additionalProperties": {
"$ref": "#/definitions/TableDdlV1"
}
},
"JdbcEngineV1": {
"type": "object",
"description": "Jdbc engine",
"properties": {
"tables": {
"$ref": "#/definitions/MapTableDdlV1",
"description": "List of all SQL create statements used to create audit tables for this JDBC engine.\nTables are created only if the execution of the pingSQL statement fails"
},
"quote": {
"type": "string",
"description": "TODO"
},
"viewPrefix": {
"type": "string",
"description": "TODO"
},
"preActions": {
"type": "string",
"description": "TODO"
},
"strategyBuilder": {
"type": "string",
"description": "TODO"
},
"columnRemarks": {
"type": "string",
"description": "TODO"
},
"tableRemarks": {
"type": "string",
"description": "TODO"
}
},
"required": ["tables", "quote", "strategyBuilder"]
},
"PrivacyV1": {
"type": "object",
"properties": {
"options": {
"$ref": "#/definitions/MapString",
"description": "Privacy strategies. The following default strategies are defined by default:\n- none: Leave the data as is\n- hide: replace the data with an empty string\n- hideX(\"s\", n): replace the string with n occurrences of the string 's'\n- md5: Redact the data using the MD5 algorithm\n- sha1: Redact the data using the SHA1 algorithm\n- sha256: Redact the data using the SHA256 algorithm\n - sha512: Redact the data using the SHA512 algorithm\n- initials: keep only the first char of each word in the data"
}
}
},
"InternalV1": {
"type": "object",
"description": "configure Spark internal options",
"properties": {
"cacheStorageLevel": {
"$ref": "#/definitions/ConvertibleToString",
"description": "How the RDD are cached. Default is MEMORY_AND_DISK_SER.\nAvailable options are (https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/storage/StorageLevel.html):\n- MEMORY_ONLY\n- MEMORY_AND_DISK\n- MEMORY_ONLY_SER\n- MEMORY_AND_DISK_SER\n- DISK_ONLY\n- OFF_HEAP"
},
"intermediateBigqueryFormat": {
"$ref": "#/definitions/ConvertibleToString",
"description": "May be parquet or ORC. Default is parquet. Used for BigQuery intermediate storage. Use ORC for for JSON files to keep the original data structure.\nhttps://stackoverflow.com/questions/53674838/spark-writing-parquet-arraystring-converts-to-a-different-datatype-when-loadin"
},
"temporaryGcsBucket": {
"$ref": "#/definitions/ConvertibleToString",
"description": "The GCS bucket that temporarily holds the data before it is loaded to BigQuery."
},
"substituteVars": {
"description": "Internal use. Do not modify.",
"type": "boolean"
},
"bqAuditSaveInBatchMode": {
"description": "TODO",
"type": "boolean"
}
}
},
"AccessPoliciesV1": {
"type": "object",
"properties": {
"apply": {
"description": "Should access policies be enforced ?",
"type": "boolean"
},
"location": {
"$ref": "#/definitions/ConvertibleToString",
"description": "GCP project location. Required if apply is true."
},
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": "GCP Project id. Required if apply is true."
},
"taxonomy": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Taxonomy name. Required if apply is true."
}
}
},
"SparkSchedulingV1": {
"type": "object",
"properties": {
"maxJobs": {
"description": "Max number of Spark jobs to run in parallel, default is 1",
"type": "integer"
},
"poolName": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Pool name to use for Spark jobs, default is 'default'"
},
"mode": {
"$ref": "#/definitions/ConvertibleToString",
"description": "This can be FIFO or FAIR, to control whether jobs within the pool queue up behind each other (the default) or share the pool’s resources fairly."
},
"file": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Scheduler filename in the metadata folder. If not set, defaults to fairscheduler.xml."
}
}
},
"ExpectationsConfigV1": {
"type": "object",
"properties": {
"path": {
"$ref": "#/definitions/ConvertibleToString",
"description": "When using filesystem storage, the path to the expectations file"
},
"active": {
"description": "should expectations be executed ?",
"type": "boolean"
},
"failOnError": {
"description": "should load / transform fail on expectation error ?",
"type": "boolean"
}
}
},
"MetricsV1": {
"type": "object",
"properties": {
"path": {
"$ref": "#/definitions/ConvertibleToString",
"description": "When using filesystem storage, the path to the metrics file"
},
"discreteMaxCardinality": {
"description": "Max number of unique values accepted for a discrete column. Default is 10",
"type": "integer"
},
"active": {
"description": "Should metrics be computed ?",
"type": "boolean"
}
}
},
"AllSinksV1": {
"type": "object",
"properties": {
"connectionRef": {
"$ref": "#/definitions/ConvertibleToString",
"description": "JDBC: Connection String"
},
"clustering": {
"description": "FS or BQ: List of attributes to use for clustering",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"days": {
"type": "number",
"description": "BQ: Number of days before this table is set as expired and deleted. Never by default."
},
"requirePartitionFilter": {
"type": "boolean",
"description": "BQ: Should we require a partition filter on every request ? No by default."
},
"materializedView": {
"$ref": "#/definitions/Materialization",
"description": "Should we materialize as a table or as a view when saving the results ? TABLE by default."
},
"enableRefresh": {
"type": "boolean",
"description": "BQ: Enable automatic refresh of materialized view ? false by default."
},
"refreshIntervalMs": {
"type": "number",
"description": "BQ: Refresh interval in milliseconds. Default to BigQuery default value"
},
"id": {
"$ref": "#/definitions/ConvertibleToString",
"description": "ES: Attribute to use as id of the document. Generated by Elasticsearch if not specified."
},
"format": {
"$ref": "#/definitions/ConvertibleToString",
"description": "FS: File format"
},
"extension": {
"$ref": "#/definitions/ConvertibleToString",
"description": "FS: File extension"
},
"partition": {
"type": "array",
"items": {
"type": "string"
},
"description": "FS or BQ: List of partition attributes"
},
"coalesce": {
"type": "boolean",
"description": "When outputting files, should we coalesce it to a single file. Useful when CSV is the output format."
},
"path": {
"type": "string",
"description": "Optional path attribute if you want to save the file outside of the default location (datasets folder)"
},
"options": {
"$ref": "#/definitions/MapString",
"description": "spark options to use"
}
}
},
"WriteStrategyTypeBase": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO",
"oneOf": [
{
"const": "OVERWRITE",
"description": "TODO"
},
{
"const": "APPEND",
"description": "TODO"
},
{
"const": "UPSERT_BY_KEY",
"description": "TODO"
},
{
"const": "UPSERT_BY_KEY_AND_TIMESTAMP",
"description": "TODO"
},
{
"const": "DELETE_THEN_INSERT",
"description": "TODO"
},
{
"const": "SCD2",
"description": "TODO"
},
{
"const": "OVERWRITE_BY_PARTITION",
"description": "TODO"
}
]
},
"WriteStrategyTypeV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Write strategy types supported by Starlake",
"oneOf": [
{
"$ref": "#/definitions/WriteStrategyTypeBase"
}
]
},
"OpenWriteStrategyTypeV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Write strategy types supported by Starlake",
"oneOf": [
{
"$ref": "#/definitions/WriteStrategyTypeBase"
},
{
"$ref": "#/definitions/ConvertibleToString",
"description": "Any valid write strategy type",
"not": {
"$ref": "#/definitions/WriteStrategyTypeBase"
}
}
]
},
"WriteStrategyV1": {
"type": "object",
"properties": {
"type": {
"$ref": "#/definitions/WriteStrategyTypeV1",
"description": "TODO"
},
"types": {
"type": "object",
"description": "TODO",
"additionalProperties": {
"$ref": "#/definitions/OpenWriteStrategyTypeV1"
}
},
"key": {
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
},
"description": "TODO"
},
"timestamp": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"queryFilter": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"on": {
"$ref": "#/definitions/MergeOnV1",
"description": "TODO"
},
"startTs": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"endTs": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
}
}
},
"MetadataV1": {
"type": "object",
"properties": {
"format": {
"$ref": "#/definitions/FormatV1"
},
"encoding": {
"$ref": "#/definitions/ConvertibleToString",
"description": "UTF-8 if not specified."
},
"multiline": {
"type": "boolean",
"description": "are json objects on a single line or multiple line ? Single by default. false means single. false also means faster"
},
"array": {
"type": "boolean",
"description": "Is the json stored as a single object array ? false by default. This means that by default we have one json document per line."
},
"withHeader": {
"type": "boolean",
"description": "does the dataset has a header ? true by default"
},
"separator": {
"$ref": "#/definitions/ConvertibleToString",
"description": "the values delimiter, ';' by default value may be a multichar string starting from Spark3"
},
"quote": {
"$ref": "#/definitions/ConvertibleToString",
"description": "The String quote char, '\"' by default"
},
"escape": {
"$ref": "#/definitions/ConvertibleToString",
"description": "escaping char '\\' by default"
},
"sink": {
"$ref": "#/definitions/AllSinksV1"
},
"ignore": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Pattern to ignore or UDF to apply to ignore some lines"
},
"directory": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Folder on the local filesystem where incoming files are stored.\n Typically, this folder will be scanned periodically to move the dataset to the cluster for ingestion.\n Files located in this folder are moved to the stage folder for ingestion by the \"import\" command."
},
"extensions": {
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
},
"description": "recognized filename extensions. json, csv, dsv, psv are recognized by default.\nOnly files with these extensions will be moved to the stage folder."
},
"ack": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Ack extension used for each file. \".ack\" if not specified.\nFiles are moved to the stage folder only once a file with the same name as the source file and with this extension is present.\nTo move a file without requiring an ack file to be present, set explicitly this property to the empty string value \"\"."
},
"options": {
"$ref": "#/definitions/MapString",
"description": "Options to add to the spark reader"
},
"loader": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Loader to use, 'spark' or 'native'. Defaults to 'spark' unless the SL_LOADER env variable is set to 'native'"
},
"emptyIsNull": {
"description": "Treat empty columns as null in DSV files. Default to false",
"type": "boolean"
},
"dagRef": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Reference to the DAG configuration to use for this domain/table"
},
"freshness": {
"$ref": "#/definitions/FreshnessV1",
"description": "Configure freshness checks on this dataset"
},
"nullValue": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Treat a specific input string as a null value indicator"
},
"schedule": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Cron expression to use for this domain/table"
},
"writeStrategy": {
"$ref": "#/definitions/WriteStrategyV1",
"description": "TODO"
}
}
},
"AreaV1": {
"type": "object",
"properties": {
"incoming": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Files are read from this folder for ingestion by the \"import\" command."
},
"stage": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Files recognized by the extensions property are moved to this folder for ingestion by the \"import\" command."
},
"unresolved": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Files that cannot be ingested (do not match by any table pattern) are moved to this folder."
},
"archive": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Files that have been ingested are moved to this folder if SL_ARCHIVE is set to true."
},
"ingesting": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Files that are being ingested are moved to this folder."
},
"replay": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Invalid records are stored in this folder in source format when SL_SINK_REPLAY_TO_FILE is set to true."
},
"hiveDatabase": {
"$ref": "#/definitions/ConvertibleToString",
"description": ""
}
}
},
"FreshnessV1": {
"type": "object",
"properties": {
"warn": {
"$ref": "#/definitions/ConvertibleToString",
"description": "How old may be the data before a warning is raised. Use syntax like '3 day' or '2 hour' or '30 minute'"
},
"error": {
"$ref": "#/definitions/ConvertibleToString",
"description": "How old may be the data before an error is raised. Use syntax like '3 day' or '2 hour' or '30 minute'"
}
}
},
"TableV1": {
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Schema name, must be unique among all the schemas belonging to the same domain.\n * Will become the hive table name On Premise or BigQuery Table name on GCP."
},
"pattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "filename pattern to which this schema must be applied.\n * This instructs the framework to use this schema to parse any file with a filename that match this pattern."
},
"attributes": {
"description": "Attributes parsing rules.",
"type": "array",
"items": {
"$ref": "#/definitions/AttributeV1"
}
},
"metadata": {
"$ref": "#/definitions/MetadataV1",
"description": "Dataset metadata"
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "free text"
},
"presql": {
"type": "array",
"description": "Reserved for future use.",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"postsql": {
"type": "array",
"description": "Reserved for future use.",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"tags": {
"description": "Set of string to attach to this Schema",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"rls": {
"description": " Row level security on this schema.",
"type": "array",
"items": {
"$ref": "#/definitions/RowLevelSecurityV1"
}
},
"expectations": {
"description": "Expectations to check after Load / Transform has succeeded",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"primaryKey": {
"description": "List of columns that make up the primary key",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"acl": {
"description": "Map of rolename -> List[Users].",
"type": "array",
"items": {
"$ref": "#/definitions/AccessControlEntryV1"
}
},
"rename": {
"$ref": "#/definitions/ConvertibleToString",
"description": "If present, the table is renamed with this name. Useful when use in conjunction with the 'extract' module"
},
"sample": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Store here a couple of records illustrating the table data."
},
"filter": {
"$ref": "#/definitions/ConvertibleToString",
"description": "remove all records that do not match this condition"
},
"patternSample": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Sample of filename matching this schema"
}
},
"required": ["name", "pattern", "attributes"]
},
"MetricTypeV1": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO",
"oneOf": [
{
"const": "DISCRETE",
"description": "TODO"
},
{
"const": "CONTINUOUS",
"description": "TODO"
},
{
"const": "TEXT",
"description": "TODO"
},
{
"const": "NONE",
"description": "TODO"
}
]
},
"AttributeV1": {
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Attribute name as defined in the source dataset and as received in the file"
},
"type": {
"$ref": "#/definitions/ConvertibleToString",
"description": "semantic type of the attribute. Default to string"
},
"array": {
"type": "boolean",
"description": "Is it an array ?"
},
"required": {
"type": "boolean",
"description": "Should this attribute always be present in the source. Default to true."
},
"privacy": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Should this attribute be applied a privacy transformation at ingestion time"
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "free text for attribute description"
},
"rename": {
"$ref": "#/definitions/ConvertibleToString",
"description": "If present, the attribute is renamed with this name"
},
"sample": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Sample data for this attribute"
},
"metricType": {
"$ref": "#/definitions/MetricTypeV1",
"description": "If present, what kind of stat should be computed for this field"
},
"attributes": {
"type": "array",
"description": "List of sub-attributes (valid for JSON and XML files only)",
"items": {
"$ref": "#/definitions/AttributeV1"
}
},
"position": {
"$ref": "#/definitions/PositionV1"
},
"default": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default value for this attribute when it is not present."
},
"tags": {
"type": "array",
"description": "Tags associated with this attribute",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"trim": {
"$ref": "#/definitions/TrimV1"
},
"script": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Scripted field : SQL request on renamed column"
},
"foreignKey": {
"$ref": "#/definitions/ConvertibleToString",
"description": "If this attribute is a foreign key, reference to [domain.]table[.attribute]"
},
"ignore": {
"type": "boolean",
"description": "Should this attribute be ignored on ingestion. Default to false"
},
"accessPolicy": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Policy tag to assign to this attribute. Used for column level security"
}
},
"required": ["name"]
},
"AutoTaskDescV1": {
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Name of this task"
},
"sql": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Main SQL request to execute (do not forget to prefix table names with the database name to avoid conflicts)"
},
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Output Database (refer to a project id in BigQuery). Default to SL_DATABASE env var if set."
},
"domain": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Output domain in output Area (Will be the Database name in Hive or Dataset in BigQuery)"
},
"table": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Dataset Name in output Area (Will be the Table name in Hive & BigQuery)"
},
"partition": {
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
},
"description": "List of columns used for partitioning the output."
},
"presql": {
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
},
"description": "List of SQL requests to execute before the main SQL request is run"
},
"postsql": {
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
},
"description": "List of SQL requests to execute after the main SQL request is run"
},
"sink": {
"$ref": "#/definitions/AllSinksV1"
},
"rls": {
"type": "array",
"items": {
"$ref": "#/definitions/RowLevelSecurityV1"
}
},
"expectations": {
"description": "Expectations to check after Load / Transform has succeeded",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"acl": {
"description": "Map of rolename -> List[Users].",
"type": "array",
"items": {
"$ref": "#/definitions/AccessControlEntryV1"
}
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Output table description"
},
"freshness": {
"$ref": "#/definitions/FreshnessV1",
"description": "Configure freshness checks on the output table"
},
"attributes": {
"description": "Attributes comments and access policies",
"type": "array",
"items": {
"$ref": "#/definitions/AttributeDescV1"
}
},
"python": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Python script URI to execute instead of the SQL request"
},
"tags": {
"description": "Set of string to attach to the output table",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"writeStrategy": {
"$ref": "#/definitions/WriteStrategyV1",
"description": "TODO"
},
"schedule": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Cron expression to use for this task"
},
"dagRef": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Cron expression to use for this domain/table"
},
"taskTimeoutMs": {
"type": "integer",
"description": "Number of milliseconds before a communication timeout."
},
"parseSQL": {
"type": "boolean",
"description": "Should we parse this SQL make it update the table according to write strategy or just execute it ?"
},
"connectionRef": {
"type": "string",
"description": "Used when the default connection ref present in the application.sl.yml file is not the one to use to run the SQL request for this task."
}
},
"required": []
},
"DurationUnit": {
"$ref": "#/definitions/ConvertibleToString",
"oneOf": [
{
"const": "NANOSECONDS"
},
{
"const": "MICROSECONDS"
},
{
"const": "MILLISECONDS"
},
{
"const": "SECONDS"
},
{
"const": "MINUTES"
},
{
"const": "HOURS"
},
{
"const": "DAYS"
}
]
},
"Duration": {
"type": "object",
"properties": {
"length": {
"type": "integer"
},
"unit": {
"$ref": "#/definitions/DurationUnit"
},
"finite": {
"type": "boolean",
"default": true
}
}
},
"LockV1": {
"type": "object",
"properties": {
"path": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Name of the lock"
},
"timeout": {
"type": "integer",
"description": "TODO"
},
"pollTime": {
"$ref": "#/definitions/Duration",
"description": "TODO. Default 5 seconds"
},
"refreshTime": {
"$ref": "#/definitions/Duration",
"description": "TODO. Default 5 seconds"
}
}
},
"AuditV1": {
"type": "object",
"properties": {
"path": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Main SQL request to execute (do not forget to prefix table names with the database name to avoid conflicts)"
},
"sink": {
"$ref": "#/definitions/AllSinksV1",
"description": "Output Database (refer to a project id in BigQuery). Default to SL_DATABASE env var if set."
},
"maxErrors": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Output domain in output Area (Will be the Database name in Hive or Dataset in BigQuery)"
},
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Dataset Name in output Area (Will be the Table name in Hive & BigQuery)"
},
"domain": {
"$ref": "#/definitions/ConvertibleToString"
},
"domainExpectation": {
"$ref": "#/definitions/ConvertibleToString"
},
"domainRejected": {
"$ref": "#/definitions/ConvertibleToString"
},
"active": {
"type": "boolean",
"description": "Output table description"
},
"sql": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
}
},
"required": []
},
"AttributeDescV1": {
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Column name"
},
"type": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Column Type"
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Column description"
},
"accessPolicy": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Access policy to apply to this column"
}
},
"required": ["name"]
},
"DomainV1": {
"type": "object",
"description": "A schema in JDBC database or a folder in HDFS or a dataset in BigQuery.",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Domain name. Make sure you use a name that may be used as a folder name on the target storage.\n - When using HDFS or Cloud Storage, files once ingested are stored in a sub-directory named after the domain name.\n - When used with BigQuery, files are ingested and sorted in tables under a dataset named after the domain name."
},
"metadata": {
"$ref": "#/definitions/MetadataV1"
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Domain Description (free text)"
},
"tags": {
"description": "Set of string to attach to this domain",
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
},
"rename": {
"$ref": "#/definitions/ConvertibleToString",
"description": "If present, the attribute is renamed with this name"
},
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Output Database (refer to a project id in BigQuery). Default to SL_DATABASE env var if set."
}
}
},
"AutoJobDescV1": {
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Optional name. If not specified, the name of the file without the extension is used."
},
"tasks": {
"type": "array",
"items": {
"$ref": "#/definitions/AutoTaskDescV1",
"description": "List of transform tasks to execute"
}
},
"comment": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Optional description."
},
"default": {
"$ref": "#/definitions/AutoTaskDescV1",
"description": "Default task properties to apply to all tasks defined in tasks section and in included files"
}
}
},
"JDBCTableV1": {
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Table name. Set to '*' to extract all tables. Scope: Schema and Data extraction."
},
"sql": {
"$ref": "#/definitions/ConvertibleToString",
"description": "SQL used to extract schema and data. Ignore columns attribute if set."
},
"columns": {
"description": "List of columns to extract. All columns by default.",
"type": "array",
"minItems": 1,
"items": {
"oneOf": [
{
"$ref": "#/definitions/ConvertibleToString",
"description": "Column name to extract. Scope: Schema and Data extraction."
},
{
"type": "object",
"properties": {
"name": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Column name to extract. Scope: Schema and Data extraction."
},
"rename": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Rename database column name. Scope: Schema and Data extraction."
}
},
"required": ["name"]
}
]
}
},
"partitionColumn": {
"$ref": "#/definitions/ConvertibleToString"
},
"numPartitions": {
"type": "integer"
},
"connectionOptions": {
"$ref": "#/definitions/MapString"
},
"fetchSize": {
"type": "integer"
},
"fullExport": {
"type": "boolean"
},
"filter": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Filter applied along data extraction. Scope: Data extraction"
},
"stringPartitionFunc": {
"$ref": "#/definitions/ConvertibleToString",
"description": "SQL template used on partition column's of type String. Some implementations are already defined, see ai.starlake.extract.JdbcDbUtils.getStringPartitionFunc. Mandatory variables: col, nb_partitions. Scope: Data extraction."
}
}
},
"OutputV1": {
"type": "object",
"properties": {
"encoding": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Specifies encoding (charset) of saved CSV files."
},
"withHeader": {
"type": "boolean",
"description": "If true, writes the names of columns as the first line."
},
"separator": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Character used as a separator for each field and value. TODO: check if we are expecting character."
},
"quote": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Character used for escaping quoted values where the separator can be part of the value. TODO: check if we are expecting character."
},
"escape": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Character used for escaping quotes inside an already quoted value. TODO: check if we are expecting character."
},
"nullValue": {
"$ref": "#/definitions/ConvertibleToString",
"description": "String representation of a null value."
},
"datePattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Java date pattern to apply on date object. Have a look at https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html"
},
"timestampPattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Java timestamp pattern to apply on timestamp object. Have a look at https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html"
}
}
},
"JDBCSchemaBase": {
"type": "object",
"properties": {
"catalog": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Optional catalog name in the source database. Scope: Schema and Data extraction."
},
"schema": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Database schema where source tables are located. For mysql use this instead of catalog. Scope: Schema and Data extraction."
},
"tableRemarks": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Query template used to retrieve table remark. Available variables: catalog, schema, table. Scope: Schema extraction."
},
"columnRemarks": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Query template used to retrieve all columns' remark of a table. Available variables: catalog, schema, table. Scope: Schema extraction."
},
"tableTypes": {
"description": "One or many of the predefined table types. Scope: Schema and Data extraction.",
"type": "array",
"items": {
"$ref": "#/definitions/TableTypeV1"
}
},
"template": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Template used during schema extraction in order to generate load files (domain and tables). Scope: Schema extraction."
},
"pattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Pattern template used to define load tables' file pattern. Available variables: catalog, schema, table. Scope: Schema extraction."
},
"numericTrim": {
"$ref": "#/definitions/TrimV1",
"description": "Trim strategies applied to numeric fields set on load table's definition. Scope: Schema extraction."
},
"partitionColumn": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Column to use in order to parallelize data extraction. Scope: Data extraction."
},
"numPartitions": {
"type": "integer",
"description": "Number of data partitions to create. Scope: Data extraction."
},
"connectionOptions": {
"$ref": "#/definitions/MapString",
"description": "Options to set on data connection. Scope: Data extraction."
},
"fetchSize": {
"type": "integer",
"description": "Number of rows to be fetched from the database when additional rows are needed. By default, most JDBC drivers use a fetch size of 10, so if you are reading 1000 objects, increasing the fetch size to 256 can significantly reduce the time required to fetch the query's results. The optimal fetch size is not always obvious. Scope: Data extraction."
},
"stringPartitionFunc": {
"$ref": "#/definitions/ConvertibleToString",
"description": "SQL template used on partition column's of type String. Some implementations are already defined, see ai.starlake.extract.JdbcDbUtils.getStringPartitionFunc. Mandatory variables: col, nb_partitions. Scope: Data extraction."
},
"fullExport": {
"type": "boolean",
"description": "Define if we should fetch the entire table's or not. If not, maximum value of partitionColumn seen during last extraction is used in order to fetch incremental data. Scope: Data extraction."
},
"sanitizeName": {
"type": "boolean",
"description": "Sanitize domain's name by keeping alpha numeric characters only. Scope: Schema and Data extraction."
}
}
},
"DefaultJDBCSchemaV1": {
"$ref": "#/definitions/JDBCSchemaBase"
},
"JDBCSchemaV1": {
"type": "object",
"allOf": [
{
"$ref": "#/definitions/JDBCSchemaBase"
},
{
"properties": {
"tables": {
"type": "array",
"description": "List of tables to extract. Scope: Schema and Data extraction.",
"items": {
"$ref": "#/definitions/JDBCTableV1"
}
},
"exclude": {
"type": "array",
"description": "List of tables to exclude. Applied on tables list. Scope: Schema and Data extraction.",
"items": {
"$ref": "#/definitions/ConvertibleToString"
}
}
}
}
]
},
"JDBCSchemasV1": {
"type": "object",
"properties": {
"jdbcSchemas": {
"description": "Describe what to fetch from data connection. Scope: Schema and Data extraction.",
"type": "array",
"items": {
"$ref": "#/definitions/JDBCSchemaV1"
}
},
"connectionRef": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Connection used to retrieve at least data from it. If not defined, fallback to application settings. Expected connection name as defined in the connections section of the application.conf file. Scope: Schema and data extraction."
},
"auditConnectionRef": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Connection used to read/store audit from it. If not defined, fallbacks to connectionRef. Expected connection name as defined in the connections section of the application.conf file. Scope: Data extraction."
},
"output": {
"$ref": "#/definitions/OutputV1",
"description": "Override the output format of data extraction. Scope: Data extraction."
},
"default": {
"$ref": "#/definitions/DefaultJDBCSchemaV1",
"description": "Configuration merged into each jdbcSchemas. Scope: Schema and Data extraction."
}
}
},
"InputRefV1": {
"description": "Input for ref object",
"type": "object",
"properties": {
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Database pattern to match, none if any database"
},
"domain": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Domain pattern to match, none if any domain match"
},
"table": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Table pattern to match"
}
},
"required": ["table"]
},
"OutputRefV1": {
"description": "Output for ref object",
"type": "object",
"properties": {
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": ""
},
"domain": {
"$ref": "#/definitions/ConvertibleToString",
"description": ""
},
"table": {
"$ref": "#/definitions/ConvertibleToString",
"description": ""
}
},
"required": ["table", "domain", "database"]
},
"RefV1": {
"description": "Describe how to resolve a reference in a transform task",
"type": "object",
"properties": {
"input": {
"$ref": "#/definitions/InputRefV1",
"description": "The input table to resolve"
},
"output": {
"$ref": "#/definitions/OutputRefV1",
"description": "The output table resolved with the domain and database"
}
},
"required": ["input", "output"]
},
"KafkaTopicConfigV1": {
"properties": {
"topicName": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"maxRead": {
"type": "integer",
"description": "TODO"
},
"fields": {
"type": "array",
"items": {
"$ref": "#/definitions/ConvertibleToString"
},
"description": "TODO"
},
"partitions": {
"type": "integer",
"description": "TODO"
},
"replicationFactor": {
"type": "integer",
"description": "TODO"
},
"createOptions": {
"$ref": "#/definitions/MapString",
"description": "TODO"
},
"accessOptions": {
"$ref": "#/definitions/MapString",
"description": "TODO"
},
"headers": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/MapString"
},
"description": "TODO"
}
}
},
"KafkaConfigV1": {
"type": "object",
"properties": {
"serverOptions": {
"$ref": "#/definitions/MapString",
"description": "TODO"
},
"topics": {
"type": "object",
"description": "Map of topic configs",
"additionalProperties": {
"$ref": "#/definitions/KafkaTopicConfigV1"
}
},
"cometOffsetsMode": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"customDeserializers": {
"$ref": "#/definitions/MapString",
"description": "TODO"
}
}
},
"DagRefV1": {
"type": "object",
"properties": {
"load": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"transform": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
}
}
},
"HttpV1": {
"type": "object",
"properties": {
"interface": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"port": {
"type": "integer",
"description": "TODO"
}
}
},
"AppConfigV1": {
"type": "object",
"properties": {
"env": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default environment to use. May be also set using the SL_ENV environment variable"
},
"datasets": {
"$ref": "#/definitions/ConvertibleToString",
"description": "When using filesystem storage, default path to store the datasets"
},
"incoming": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Incoming folder to use during autoload"
},
"dags": {
"$ref": "#/definitions/ConvertibleToString",
"description": "DAG generation config folder. metadata/dags by default"
},
"tests": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Path to tests folder. Default is ${metadata}/tests"
},
"writeStrategies": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Location where are located user defined write strategies; Default is ${metadata}/write-strategies"
},
"metadata": {
"$ref": "#/definitions/ConvertibleToString",
"description": "default metadata folder name. May be also set using the SL_METADATA environment variable"
},
"metrics": {
"$ref": "#/definitions/MetricsV1"
},
"validateOnLoad": {
"type": "boolean",
"description": "Validate the YAML file when loading it. If set to true fails on any error"
},
"audit": {
"$ref": "#/definitions/AuditV1"
},
"archive": {
"type": "boolean",
"description": "Should ingested files be archived after ingestion ?"
},
"sinkReplayToFile": {
"type": "boolean",
"description": "Should invalid records be stored in a replay file ?"
},
"lock": {
"$ref": "#/definitions/LockV1"
},
"defaultWriteFormat": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default write format in Spark. parquet is the default"
},
"defaultRejectedWriteFormat": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default write format in Spark for rejected records. parquet is the default"
},
"defaultAuditWriteFormat": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default write format in Spark for audit records. parquet is the default"
},
"csvOutput": {
"type": "boolean",
"description": "output files in CSV format ? Default is false"
},
"csvOutputExt": {
"$ref": "#/definitions/ConvertibleToString",
"description": "CSV file extension when csvOutput is true. Default is .csv"
},
"privacyOnly": {
"type": "boolean",
"description": "Only generate privacy tasks. Reserved for internal use"
},
"emptyIsNull": {
"type": "boolean",
"description": "Should empty strings be considered as null values ?"
},
"loader": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default loader to use when none is specified in the schema. Valid values are 'spark' or 'native'. Default is 'spark'"
},
"rowValidatorClass": {
"$ref": "#/definitions/ConvertibleToString",
"description": ""
},
"treeValidatorClass": {
"$ref": "#/definitions/ConvertibleToString",
"description": ""
},
"loadStrategyClass": {
"$ref": "#/definitions/ConvertibleToString",
"description": "In what order should the files for a same table be loaded ? By time (default) or by or name ?\n",
"enum": [
"ai.starlake.job.load.IngestionNameStrategy",
"ai.starlake.job.load.IngestionTimeStrategy"
]
},
"grouped": {
"type": "boolean",
"description": "Should we load of the files to be stored in the same table in a single task or one by one ?"
},
"groupedMax": {
"type": "integer",
"description": "Maximum number of files to be stored in the same table in a single task"
},
"scd2StartTimestamp": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"scd2EndTimestamp": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"area": {
"$ref": "#/definitions/AreaV1",
"description": "stage, ingesting ... areas configuration"
},
"hadoop": {
"$ref": "#/definitions/MapString",
"description": "Hadoop configuration if applicable"
},
"connections": {
"$ref": "#/definitions/MapConnectionV1",
"description": "Connections configurations"
},
"jdbcEngines": {
"$ref": "#/definitions/MapJdbcEngineV1",
"description": "JDBC engine configurations"
},
"privacy": {
"$ref": "#/definitions/PrivacyV1",
"description": "Privacy algorithms"
},
"root": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Root folder for the application. May be also set using the SL_ROOT environment variable"
},
"internal": {
"$ref": "#/definitions/InternalV1",
"description": "Internal configuration"
},
"accessPolicies": {
"$ref": "#/definitions/AccessPoliciesV1",
"description": "Access policies configuration"
},
"sparkScheduling": {
"$ref": "#/definitions/SparkSchedulingV1",
"description": "Spark Job scheduling configuration"
},
"udfs": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Coma separated list of UDF to register in Spark jobs. May be also set using the SL_UDFS environment variable"
},
"expectations": {
"$ref": "#/definitions/ExpectationsConfigV1",
"description": "Expectations configuration"
},
"sqlParameterPattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Pattern to use to replace parameters in SQL queries in addition to the jinja syntax {{param}}. Default is ${param}"
},
"rejectAllOnError": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Should we reject all records when an error occurs ? Default is false"
},
"rejectMaxRecords": {
"type": "integer",
"description": "Maximum number of records to reject when an error occurs. Default is 100"
},
"maxParCopy": {
"type": "integer",
"description": ""
},
"kafka": {
"$ref": "#/definitions/KafkaConfigV1",
"description": "TODO"
},
"dsvOptions": {
"$ref": "#/definitions/MapString",
"description": "DSV ingestion extra options"
},
"forceViewPattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"forceDomainPattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"forceTablePattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"forceJobPattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"forceTaskPattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"useLocalFileSystem": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"sessionDurationServe": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"database": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default target database (projectId in GCP). May be also set using the SL_DATABASE environment variable"
},
"tenant": {
"$ref": "#/definitions/ConvertibleToString",
"description": "reserved"
},
"connectionRef": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Default connection to use when loading / transforming data"
},
"schedulePresets": {
"$ref": "#/definitions/MapString",
"description": "Default connection to use when loading / transforming data"
},
"maxParTask": {
"type": "integer",
"description": "How many job to run simultaneously in dev mode (experimental)"
},
"refs": {
"type": "array",
"description": "TODO",
"items": {
"$ref": "#/definitions/RefV1"
}
},
"dagRef": {
"$ref": "#/definitions/DagRefV1",
"description": "Default connection to use when loading / transforming data"
},
"forceHalt": {
"type": "boolean",
"description": "Force application to stop even when there is some pending thread."
},
"jobIdEnvName": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"archiveTablePattern": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"archiveTable": {
"type": "boolean",
"description": "TODO"
},
"version": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"autoExportSchema": {
"type": "boolean",
"description": "TODO"
},
"longJobTimeoutMs": {
"type": "integer",
"description": "TODO"
},
"shortJobTimeoutMs": {
"type": "integer",
"description": "TODO"
},
"createSchemaIfNotExists": {
"type": "boolean",
"description": "TODO"
},
"http": {
"$ref": "#/definitions/HttpV1",
"description": "TODO"
},
"timezone": {
"$ref": "#/definitions/ConvertibleToString",
"description": "TODO"
},
"maxInteractiveRecords": {
"type": "integer",
"description": "TODO"
},
"duckdbMode": {
"type": "boolean",
"description": "is duckdb mode active"
},
"testCsvNullString": {
"$ref": "#/definitions/ConvertibleToString",
"description": "null string value in tests"
},
"hiveInTest": {
"$ref": "#/definitions/ConvertibleToString",
"description": "Internal use only"
},
"spark": {
"type": "object",
"description": "Map of string",
"additionalProperties": true
},
"extra": {
"type": "object",
"description": "Map of string",
"additionalProperties": true
}
}
},
"StarlakeV1Base": {
"type": "object",
"properties": {
"types": {
"type": "array",
"items": {
"$ref": "#/definitions/TypeV1"
}
},
"dag": {
"$ref": "#/definitions/DagGenerationConfigV1"
},
"extract": {
"$ref": "#/definitions/JDBCSchemasV1"
},
"load": {
"$ref": "#/definitions/DomainV1"
},
"transform": {
"$ref": "#/definitions/AutoJobDescV1"
},
"task": {
"$ref": "#/definitions/AutoTaskDescV1"
},
"env": {
"$ref": "#/definitions/MapString"
},
"table": {
"$ref": "#/definitions/TableV1"
},
"refs": {
"type": "array",
"items": {
"$ref": "#/definitions/RefV1"
}
},
"application": {
"$ref": "#/definitions/AppConfigV1"
}
},
"oneOf": [
{
"required": ["extract"]
},
{
"required": ["load"]
},
{
"required": ["transform"]
},
{
"required": ["env"]
},
{
"required": ["types"]
},
{
"required": ["tables"]
},
{
"required": ["table"]
},
{
"required": ["task"]
},
{
"required": ["application"]
},
{
"required": ["refs"]
},
{
"required": ["dag"]
}
]
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy