schema.comet.json Maven / Gradle / Ivy

Go to download
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Starlake Data Pipeline",
  "description": "JSON Schema for Starlake Data Pipeline",
  "definitions": {
    "PrimitiveType": {
      "type": "string",
      "oneOf": [
        {
          "const": "string",
          "description": ""
        },
        {
          "const": "long",
          "description": ""
        },
        {
          "const": "int",
          "description": ""
        },
        {
          "const": "short",
          "description": ""
        },
        {
          "const": "double",
          "description": ""
        },
        {
          "const": "boolean",
          "description": ""
        },
        {
          "const": "byte",
          "description": ""
        },
        {
          "const": "date",
          "description": ""
        },
        {
          "const": "timestamp",
          "description": ""
        },
        {
          "const": "decimal",
          "description": ""
        },
        {
          "const": "struct",
          "description": ""
        }
      ]
    },
    "IndexMapping": {
      "type": "string",
      "oneOf": [
        {
          "const": "text",
          "description": ""
        },
        {
          "const": "keyword",
          "description": ""
        },
        {
          "const": "long",
          "description": ""
        },
        {
          "const": "integer",
          "description": ""
        },
        {
          "const": "short",
          "description": ""
        },
        {
          "const": "byte",
          "description": ""
        },
        {
          "const": "double",
          "description": ""
        },
        {
          "const": "float",
          "description": ""
        },
        {
          "const": "half_float",
          "description": ""
        },
        {
          "const": "scaled_float",
          "description": ""
        },
        {
          "const": "date",
          "description": ""
        },
        {
          "const": "boolean",
          "description": ""
        },
        {
          "const": "binary",
          "description": ""
        },
        {
          "const": "integer_rang",
          "description": ""
        },
        {
          "const": "float_range",
          "description": ""
        },
        {
          "const": "long_range",
          "description": ""
        },
        {
          "const": "double_range",
          "description": ""
        },
        {
          "const": "date_range",
          "description": ""
        },
        {
          "const": "geo_point",
          "description": ""
        },
        {
          "const": "geo_shape",
          "description": ""
        },
        {
          "const": "ip",
          "description": ""
        },
        {
          "const": "completion",
          "description": ""
        },
        {
          "const": "token_count",
          "description": ""
        },
        {
          "const": "object",
          "description": ""
        },
        {
          "const": "array",
          "description": ""
        }
      ]
    },
    "Engine": {
      "description": "SPARK or BQ. Default value is SPARK.",
      "type": "string",
      "oneOf": [
        {
          "const": "BQ",
          "description": ""
        },
        {
          "const": "SPARK",
          "description": ""
        }
      ]
    },
    "WriteMode": {
      "description": "Append to or overwrite existing data",
      "type": "string",
      "oneOf": [
        {
          "const": "OVERWRITE",
          "description": ""
        },
        {
          "const": "APPEND",
          "description": ""
        },
        {
          "const": "ERROR_IF_EXISTS",
          "description": ""
        },
        {
          "const": "IGNORE",
          "description": ""
        }
      ]
    },
    "UserType": {
      "type": "string",
      "oneOf": [
        {
          "const": "SA",
          "description": ""
        },
        {
          "const": "USER",
          "description": ""
        },
        {
          "const": "GROUP",
          "description": ""
        }
      ]
    },
    "Trim": {
      "type": "string",
      "oneOf": [
        {
          "const": "LEFT",
          "description": ""
        },
        {
          "const": "RIGHT",
          "description": ""
        },
        {
          "const": "BOTH",
          "description": ""
        },
        {
          "const": "NONE",
          "description": ""
        }
      ]
    },
    "SinkType": {
      "description": "Where to sink the data",
      "type": "string",
      "oneOf": [
        {
          "const": "NONE",
          "description": ""
        },
        {
          "const": "JBDC",
          "description": ""
        },
        {
          "const": "BQ",
          "description": ""
        },
        {
          "const": "ES",
          "description": ""
        },
        {
          "const": "FS",
          "description": ""
        }
      ]
    },
    "Mode": {
      "description": "FILE mode by default.\\nFILE and STREAM are the two accepted values.\\nFILE is currently the only supported mode.",
      "type": "string",
      "oneOf": [
        {
          "const": "FILE",
          "description": ""
        },
        {
          "const": "STREAM",
          "description": ""
        },
        {
          "const": "FILE_AND_STREAM",
          "description": ""
        }
      ]
    },
    "Type": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string"
        },
        "pattern": {
          "type": "string"
        },
        "zone": {
          "type": "string",
          "description": "Useful for timestamp / dates"
        },
        "sample": {
          "type": "string"
        },
        "comment": {
          "type": "string"
        },
        "indexMapping": {
          "type": "string"
        }
      },
      "required": [
        "name",
        "pattern"
      ]
    },
    "Partition": {
      "description": "Partition columns, no partitioning by default",
      "type": "object",
      "properties": {
        "sampling": {
          "type": "number"
        },
        "attributes": {
          "type": "array",
          "items": {
            "type": "string"
          }
        }
      },
      "required": []
    },
    "Position": {
      "type": "object",
      "properties": {
        "first": {
          "type": "number"
        },
        "last": {
          "type": "number"
        }
      },
      "required": [
        "first",
        "last"
      ]
    },
    "RowLevelSecurity": {
      "description": "Row level security policy to apply too the output data.",
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "This Row Level Security unique name"
        },
        "predicate": {
          "type": "string",
          "description": "The condition that goes to the WHERE clause and limitt the visible rows."
        },
        "grants": {
          "description": "user / groups / service accounts to which this security level is applied.\nex : user:[email protected],group:[email protected],serviceAccount:[email protected]",
          "type": "array",
          "items": {
            "type": "string"
          }
        }
      }
    },
    "MergeOptions": {
      "type": "object",
      "properties": {
        "key": {
          "description": "list of attributes to join existing with incoming dataset. Use renamed columns here.",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "delete": {
          "type": "string",
          "description": "Optional valid sql condition on the incoming dataset. Use renamed column here."
        },
        "timestamp": {
          "type": "string",
          "description": "Timestamp column used to identify last version, if not specified currently ingested row is considered the last"
        },
        "queryFilter": {
          "type": "string"
        }
      },
      "required": [
        "key"
      ]
    },
    "Format": {
      "description": "DSV by default. Supported file formats are :\\n- DSV : Delimiter-separated values file. Delimiter value iss specified in the \"separator\" field.\\n- POSITION : FIXED format file where values are located at an exact position in each line.\\n- JSON_FLAT : For optimisation purpose, we differentiate JSON with top level values from JSON\\n  with deep level fields. JSON_FLAT are JSON files with top level fields only.\\n- JSON :  Deep JSON file. Use only when your json documents contain subdocuments, otherwise prefer to\\n  use JSON_FLAT since it is much faster.\\n- XML : XML files",
      "type": "string",
      "oneOf": [
        {
          "const": "DSV",
          "description": ""
        },
        {
          "const": "POSITION",
          "description": ""
        },
        {
          "const": "JSON",
          "description": ""
        },
        {
          "const": "JSON_ARRAY",
          "description": ""
        },
        {
          "const": "JSON_FLAT",
          "description": "Simple Json is made of a single level attributes of simple types (no arrray or map or sub objects)"
        },
        {
          "const": "XML",
          "description": ""
        }
      ]
    },
    "MapString": {
      "type": "object",
      "additionalProperties": {
        "type": "string"
      }
    },
    "Sink": {
      "type": "object",
      "properties": {
        "type": {
          "$ref": "#/definitions/SinkType"
        },
        "name": {
          "type": "string"
        },
        "id": {
          "type": "string"
        },
        "timestamp": {
          "type": "string"
        },
        "location": {
          "type": "string"
        },
        "clustering": {
          "type": "string"
        },
        "days": {
          "type": "number"
        },
        "requirePartitionFilter": {
          "type": "boolean"
        },
        "connection": {
          "type": "string"
        },
        "partitions": {
          "type": "number"
        },
        "batchSize": {
          "type": "number"
        }
      },
      "required": [
        "type"
      ]

    },
    "Metadata": {
      "type": "object",
      "properties": {
        "partition": {
          "$ref": "#/definitions/Partition"
        },
        "mode": {
          "$ref": "#/definitions/Mode"
        },
        "format": {
          "$ref": "#/definitions/Format"
        },
        "encoding": {
          "type": "string",
          "description": "UTF-8 if not specified."
        },
        "multiline": {
          "type": "boolean",
          "description": "are json objects on a single line or multiple line ? Single by default.  false means single. false also means faster"
        },
        "array": {
          "type": "boolean",
          "description": "Is the json stored as a single object array ? false by default. This means that by default we have on json document per line."
        },
        "withHeader": {
          "type": "boolean",
          "description": "does the dataset has a header ? true bu default"
        },
        "separator": {
          "type": "string",
          "description": "the values delimiter,  ';' by default value may be a multichar string starting from Spark3"
        },
        "quote": {
          "type": "string",
          "description": "The String quote char, '\"' by default"
        },
        "escape": {
          "type": "string",
          "description": "escaping char '\\' by default"
        },
        "write": {
          "$ref": "#/definitions/WriteMode",
          "description": "Write mode, APPEND by default"
        },
        "sink": {
          "$ref": "#/definitions/Sink"
        },
        "clustering": {
          "description": "List of attributes to use for clustering",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "ignore": {
          "type": "string",
          "description": "Pattern to ignore or UDF to apply to ignore some lines"
        },
        "xml": {
          "$ref": "#/definitions/MapString",
          "description": "com.databricks.spark.xml options to use (eq. rowTag)"
        }
      }
    },
    "Schema": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Schema name, must be unique among all the schemas belonging to the same domain.\n  *                     Will become the hive table name On Premise or BigQuery Table name on GCP."
        },
        "pattern": {
          "description": "filename pattern to which this schema must be applied.\n  *                     This instructs the framework to use this schema to parse any file with a filename that match this pattern.",
          "type": "string"
        },
        "attributes": {
          "description": "Attributes parsing rules.",
          "type": "array",
          "items": {
            "$ref": "#/definitions/Attribute"
          }
        },
        "metadata": {
          "$ref": "#/definitions/Metadata",
          "description": "Dataset metadata"
        },
        "merge": {
          "$ref": "#/definitions/MergeOptions"
        },
        "comment": {
          "type": "string",
          "description": "free text"
        },
        "presql": {
          "type": "array",
          "description": "Reserved for future use.",
          "items": {
            "type": "string"
          }
        },
        "postsql": {
          "type": "array",
          "description": "Reserved for future use.",
          "items": {
            "type": "string"
          }
        },
        "tags": {
          "description": "Set of string to attach to this Schema",
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "rls": {
          "description": " Experimental. Row level security to this to this schema.",
          "type": "array",
          "items": {
            "$ref": "#/definitions/RowLevelSecurity"
          }
        },
        "assertions": {
          "$ref": "#/definitions/MapString",
          "description": "Assertions to check after Load / Transform has succeeded"
        }
      }
    },
    "Attribute": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Attribute name as defined in the source dataset and as received in the file"
        },
        "type": {
          "type": "string",
          "description": "Is it an array ?"
        },
        "array": {
          "type": "boolean",
          "description": "semantic type of the attribute"
        },
        "required": {
          "type": "boolean",
          "description": "Should this attribute always be present in the source"
        },
        "privacy": {
          "type": "string",
          "description": "Should this attribute be applied a privacy transformation at ingestion time"
        },
        "comment": {
          "type": "string",
          "description": "free text for attribute description"
        },
        "rename": {
          "type": "string",
          "description": "If present, the attribute is renamed with this name"
        },
        "metricType": {
          "type": "string",
          "description": "If present, what kind of stat should be computed for this field"
        },
        "attributes": {
          "type": "array",
          "description": "List of sub-attributes (valid for JSON and XML files only)",
          "items": {
            "$ref": "#/definitions/Attribute"
          }
        },
        "position": {
          "$ref": "#/definitions/Position"
        },
        "default": {
          "type": "string",
          "description": "Default value for this attribute when it is not present."
        },
        "tags": {
          "type": "array",
          "description": "Tags associated with this attribute",
          "items": {
            "type": "string"
          }
        },
        "trim": {
          "$ref": "#/definitions/Trim"
        },
        "script": {
          "type": "string",
          "description": "Scripted field : SQL request on renamed column"
        }
      },
      "required": [
        "name",
        "type"
      ]
    },
    "AutoTaskDesc": {
      "type": "object",
      "properties": {
        "sql": {
          "type": "string",
          "description": "Main SQL request to exexute (do not forget to prefix table names with the database name to avoid conflicts)"
        },
        "domain": {
          "type": "string",
          "description": "Output domain in output Area (Will be the Database name in Hive or Dataset in BigQuery)"
        },
        "dataset": {
          "type": "string",
          "description": "Dataset Name in output Area (Will be the Table name in Hive & BigQuery)"
        },
        "write": {
          "$ref": "#/definitions/WriteMode"
        },
        "area": {
          "type": "string",
          "description": "Target Area where domain / dataset will be stored."
        },
        "partition": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "List of columns used for partitioning the outtput."
        },
        "presql": {
          "type": "string",
          "description": "List of SQL requests to executed before the main SQL request is run"
        },
        "postsql": {
          "type": "string",
          "description": "List of SQL requests to executed after the main SQL request is run"
        },
        "sink": {
          "$ref": "#/definitions/Sink"
        },
        "rls": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/RowLevelSecurity"
          }
        },
        "assertions": {
          "$ref": "#/definitions/MapString",
          "description": "Assertions to check after Load / Transform has succeeded"
        }
      },
      "required": [
        "name",
        "domain",
        "dataset",
        "write"
      ]
    },
    "Domain": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Domain name. Make sure you use a name that may be used as a folder name on the target storage.\n                   - When using HDFS or Cloud Storage,  files once ingested are stored in a sub-directory named after the domain name.\n                   - When used with BigQuery, files are ingested and sorted in tables under a dataset named after the domain name."
        },
        "directory": {
          "description": "Folder on the local filesystem where incoming files are stored.\n                     Typically, this folder will be scanned periodically to move the dataset to the cluster for ingestion.\n                     Files located in this folder are moved to the pending folder for ingestion by the \"import\" command.",
          "type": "string"
        },
        "metadata": {
          "$ref": "#/definitions/Metadata"
        },
        "schemas": {
          "type": "array",
          "description": "List of schemas for each dataset in this domain.\nA domain ususally contains multiple schemas. Each schema defining how the contents of the input file should be parsed.\nSee Schema for more details.",
          "items": {
            "$ref": "#/definitions/Schema"
          }
        },
        "comment": {
          "description": "Domain Description (free text)",
          "type": "string"
        },
        "extensions": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "recognized filename extensions. json, csv, dsv, psv are recognized by default.\nOnly files with these extensions will be moved to the pending folder."
        },
        "ack": {
          "description": "Ack extension used for each file. \".ack\" if not specified.\nFiles are moved to the pending folder only once a file with the same name as the source file and with this extension is present.\nTo move a file without requiring an ack file to be present, set explicitly this property to the empty string value \"\".",
          "type": "string"
        }
      },
      "required": [
        "name",
        "directory",
        "schemas"
      ]
    },
    "AutoJobDesc": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Job logical name"
        },
        "area": {
          "type": "string",
          "description": "Area where the data is located.\\nWhen using the BigQuery engine, teh area corresponds to the dataset name we will be working on in this job.\\nWhen using the Spark engine, this is folder where the data should be store. Default value is \"business\""
        },
        "format": {
          "type": "string",
          "description": "output file format when using Spark engine. Ingored for BigQuery. Default value is \"parquet\""
        },
        "coalesce": {
          "type": "boolean",
          "description": "When outputting files, should we coalesce it to a single file. Useful when CSV is the output format."
        },
        "tasks": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/AutoTaskDesc",
            "description": "List of transform tasks to execute"
          }
        },
        "udf": {
          "type": "string",
          "description": "Register UDFs written in this JVM class when using Spark engine.\\nRegister UDFs stored at this location when using BigQuery engine"
        },
        "views": {
          "$ref": "#/definitions/MapString",
          "description": "Create temporary views using where the key is the view name and the map the SQL request corresponding to this view using the SQL engine supported syntax."
        },
        "engine": {
          "$ref": "#/definitions/Engine"
        }
      },
      "required": [
        "name"
      ]
    }
  },
  "type": "object",
  "properties": {
    "types": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/Type"
      }
    },
    "env": {
      "$ref": "#/definitions/MapString"
    },
    "transform": {
      "$ref": "#/definitions/AutoJobDesc"
    },
    "load": {
      "$ref": "#/definitions/Domain"
    },
    "assertions": {
      "$ref": "#/definitions/MapString",
      "description": "Assertions library defined as a map name(params) -> sql request that should return 0 record"
    },
    "views": {
      "$ref": "#/definitions/MapString"
    },
    "name": {
      "type": "string",
      "description": "Domain name. Make sure you use a name that may be used as a folder name on the target storage.\n                   - When using HDFS or Cloud Storage,  files once ingested are stored in a sub-directory named after the domain name.\n                   - When used with BigQuery, files are ingested and sorted in tables under a dataset named after the domain name."
    },
    "directory": {
      "description": "Folder on the local filesystem where incoming files are stored.\n                     Typically, this folder will be scanned periodically to move the dataset to the cluster for ingestion.\n                     Files located in this folder are moved to the pending folder for ingestion by the \"import\" command.",
      "type": "string"
    },
    "metadata": {
      "$ref": "#/definitions/Metadata"
    },
    "schemas": {
      "type": "array",
      "description": "List of schemas for each dataset in this domain.\nA domain ususally contains multiple schemas. Each schema defining how the contents of the input file should be parsed.\nSee Schema for more details.",
      "items": {
        "$ref": "#/definitions/Schema"
      }
    },
    "comment": {
      "description": "Domain Description (free text)",
      "type": "string"
    },
    "extensions": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "recognized filename extensions. json, csv, dsv, psv are recognized by default.\nOnly files with these extensions will be moved to the pending folder."
    },
    "ack": {
      "description": "Ack extension used for each file. \".ack\" if not specified.\nFiles are moved to the pending folder only once a file with the same name as the source file and with this extension is present.\nTo move a file without requiring an ack file to be present, set explicitly this property to the empty string value \"\".",
      "type": "string"
    }
  },
  "oneOf": [
    {
      "required": [
        "load"
      ]
    },
    {
      "required": [
        "views"
      ]
    },
    {
      "required": [
        "assertions"
      ]
    },
    {
      "required": [
        "env"
      ]
    },
    {
      "required": [
        "types"
      ]
    },
    {
      "required": [
        "transform"
      ]
    },
    {
      "required": [
        "name",
        "directory",
        "schemas"
      ]
    }
  ]
}