/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ebiznext.comet.schema.model

import java.util.regex.Pattern

import com.ebiznext.comet.schema.handlers.SchemaHandler
import com.ebiznext.comet.utils.TextSubstitutionEngine
import com.ebiznext.comet.utils.conversion.BigQueryUtils._
import com.fasterxml.jackson.annotation.JsonIgnore
import com.google.cloud.bigquery.Field
import org.apache.spark.sql.types._

import scala.collection.mutable
/** How datasets are merged
 *
 * @param key List of attributes used to join the existing and incoming datasets. Use renamed columns here.
 * @param delete Optional valid SQL condition on the incoming dataset. Use renamed columns here.
 * @param timestamp Timestamp column used to identify the last version; if not specified, the currently ingested row is considered the last one.
 */
case class MergeOptions(
key: List[String],
delete: Option[String] = None,
timestamp: Option[String] = None
)
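// A minimal usage sketch (the column names "id", "deleted" and "updated_at"
// are hypothetical): join existing and incoming rows on "id", treat the row
// with the most recent "updated_at" as the last version, and apply the
// optional delete condition to the incoming dataset.
//
//   val merge = MergeOptions(
//     key = List("id"),
//     delete = Some("deleted = true"),
//     timestamp = Some("updated_at")
//   )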
/** Dataset Schema
 *
 * @param name : Schema name, must be unique among all the schemas belonging to the same domain.
 *             Will become the Hive table name on-premise or the BigQuery table name on GCP.
 * @param pattern : Filename pattern to which this schema must be applied.
 *                This instructs the framework to use this schema to parse any file whose name matches this pattern.
 * @param attributes : Attribute parsing rules.
 *                   See :ref:`attribute_concept`
 * @param metadata : Dataset metadata
 *                 See :ref:`metadata_concept`
 * @param merge : How incoming data is merged with the existing dataset. See [[MergeOptions]].
 * @param comment : Free text
 * @param presql : Reserved for future use.
 * @param postsql : Reserved for future use.
 * @param tags : Set of strings to attach to this schema
 * @param rls : Experimental. Row level security applied to this schema.
 *            See :ref:`rowlevelsecurity_concept`
 */
case class Schema(
name: String,
pattern: Pattern,
attributes: List[Attribute],
metadata: Option[Metadata],
merge: Option[MergeOptions],
comment: Option[String],
presql: Option[List[String]],
postsql: Option[List[String]],
tags: Option[Set[String]] = None,
rls: Option[List[RowLevelSecurity]] = None
) {
@JsonIgnore
lazy val attributesWithoutScript: List[Attribute] = attributes.filter(_.script.isEmpty)
/** @return Whether the partition columns defined in the metadata are valid column names
 */
def validatePartitionColumns(): Boolean = {
metadata.forall(
_.getPartitionAttributes().forall(
attributes
.map(_.getFinalName())
.union(Metadata.CometPartitionColumns)
.contains
)
)
}
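// Sketch (hypothetical names): with attributes whose final names are "id" and
// "signup_date", partitioning the metadata on "signup_date" or on one of the
// built-in Metadata.CometPartitionColumns is valid, while partitioning on an
// undeclared column such as "unknown_col" is not.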
/** This Schema as a Spark Catalyst Schema
*
* @return Spark Catalyst Schema
*/
def sparkType(schemaHandler: SchemaHandler): StructType = {
val fields = attributes.map { attr =>
StructField(attr.name, attr.sparkType(schemaHandler), !attr.required)
}
StructType(fields)
}
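// Sketch (hypothetical attributes): an attribute "id" marked required and an
// optional attribute "email", both resolving to a string type through the
// SchemaHandler, would yield
//
//   StructType(Seq(
//     StructField("id", StringType, nullable = false),
//     StructField("email", StringType, nullable = true)
//   ))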
/** This Schema as a Spark Catalyst Schema, with renamed attributes
*
* @return Spark Catalyst Schema
*/
def sparkTypeWithRenamedFields(schemaHandler: SchemaHandler): StructType =
sparkSchemaWithCondition(schemaHandler, _ => true)
/** This Schema as a Spark Catalyst Schema, without scripted fields
*
* @return Spark Catalyst Schema
*/
def sparkSchemaWithoutScriptedFields(schemaHandler: SchemaHandler): StructType =
sparkSchemaWithCondition(schemaHandler, _.script.isEmpty)
private def sparkSchemaWithCondition(
schemaHandler: SchemaHandler,
p: Attribute => Boolean
): StructType = {
val fields = attributes.filter(p).map { attr =>
StructField(attr.rename.getOrElse(attr.name), attr.sparkType(schemaHandler), !attr.required)
}
StructType(fields)
}
import com.google.cloud.bigquery.{Schema => BQSchema}
def bqSchema(schemaHandler: SchemaHandler): BQSchema = {
val fields = attributes.map { attribute =>
Field
.newBuilder(
attribute.rename.getOrElse(attribute.name),
convert(attribute.sparkType(schemaHandler))
)
.setMode(if (attribute.required) Field.Mode.REQUIRED else Field.Mode.NULLABLE)
.setDescription(attribute.comment.getOrElse(""))
.build()
}
BQSchema.of(fields: _*)
}
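// Usage sketch: the resulting BQSchema can be handed to the BigQuery client,
// for instance through StandardTableDefinition when defining the target table.
//
//   StandardTableDefinition.newBuilder()
//     .setSchema(schema.bqSchema(schemaHandler))
//     .build()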
/** Return the list of renamed attributes
 *
 * @return list of tuples (oldName, newName)
 */
def renamedAttributes(): List[(String, String)] = {
attributes.filter(attr => attr.name != attr.getFinalName()).map { attr =>
(attr.name, attr.getFinalName())
}
}
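// Sketch (hypothetical attributes): an attribute declared as "user_mail" with
// rename = Some("email") contributes ("user_mail", "email") to the result;
// attributes whose final name equals their original name are excluded.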
/** Check attribute definition correctness:
 * - the schema name should be a valid table identifier
 * - attribute names should be valid Hive column identifiers
 * - an attribute name can occur only once in the schema
 *
 * @return the error list on the left, or true on the right
 */
def checkValidity(
domainMetaData: Option[Metadata],
schemaHandler: SchemaHandler
): Either[List[String], Boolean] = {
val errorList: mutable.MutableList[String] = mutable.MutableList.empty
val tableNamePattern = Pattern.compile("[a-zA-Z][a-zA-Z0-9_]{1,256}")
if (!tableNamePattern.matcher(name).matches())
errorList += s"Schema with name $name should respect the pattern ${tableNamePattern.pattern()}"
metadata.foreach { metadata =>
for (errors <- metadata.checkValidity(schemaHandler).left) {
errorList ++= errors
}
}
attributes.foreach { attribute =>
for (errors <- attribute.checkValidity(schemaHandler).left) {
errorList ++= errors
}
}
val duplicateErrorMessage =
"%s is defined %d times. An attribute can only be defined once."
for (errors <- duplicates(attributes.map(_.name), duplicateErrorMessage).left) {
errorList ++= errors
}
if (errorList.nonEmpty)
Left(errorList.toList)
else
Right(true)
}
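// Sketch: a schema named "1orders" violates the table name pattern (names
// must start with a letter), so checkValidity returns Left with a message
// such as "Schema with name 1orders should respect the pattern ...";
// a fully valid schema returns Right(true).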
/** Attributes whose metric type is DISCRETE */
def discreteAttrs(schemaHandler: SchemaHandler): List[Attribute] =
attributes.filter(_.getMetricType(schemaHandler) == MetricType.DISCRETE)

/** Attributes whose metric type is CONTINUOUS */
def continuousAttrs(schemaHandler: SchemaHandler): List[Attribute] =
attributes.filter(_.getMetricType(schemaHandler) == MetricType.CONTINUOUS)
/** Build the Elasticsearch mapping for this schema from the given template,
 * or from a built-in default template when none is provided.
 */
def mapping(
template: Option[String],
domainName: String,
schemaHandler: SchemaHandler
): String = {
val attrs = attributes.map(_.mapping(schemaHandler)).mkString(",")
val properties =
s"""
|"properties": {
|$attrs
|}""".stripMargin
val tse = TextSubstitutionEngine(
"PROPERTIES" -> properties,
"ATTRIBUTES" -> attrs,
"DOMAIN" -> domainName.toLowerCase,
"SCHEMA" -> name.toLowerCase
)
tse.apply(template.getOrElse {
s"""
|{
| "index_patterns": ["__DOMAIN__.__SCHEMA__", "__DOMAIN__.__SCHEMA__-*"],
| "settings": {
| "number_of_shards": "1",
| "number_of_replicas": "0"
| },
| "mappings": {
| "_doc": {
| "_source": {
| "enabled": true
| },
|
|"properties": {
|__ATTRIBUTES__
|}
| }
| }
|}""".stripMargin
})
}
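// Sketch (hypothetical names): with domainName = "Sales" and a schema named
// "Orders", the default template resolves to index patterns
// ["sales.orders", "sales.orders-*"], with the per-attribute mappings
// substituted for __ATTRIBUTES__.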
/** Merge domain-level and schema-level metadata into the effective metadata
 * used when ingesting files for this schema.
 */
def mergedMetadata(domainMetadata: Option[Metadata]): Metadata = {
domainMetadata
.getOrElse(Metadata())
.`import`(this.metadata.getOrElse(Metadata()))
}
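// Sketch (assuming Metadata.`import` gives the imported, schema-level values
// precedence): a domain-wide CSV separator of ";" combined with a schema-level
// separator of "," yields "," in the merged metadata.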
}
object Schema {
/** Build the Elasticsearch mapping for a Spark Catalyst schema by converting
 * its fields to attributes and delegating to Schema#mapping.
 */
def mapping(
domainName: String,
schemaName: String,
obj: StructField,
schemaHandler: SchemaHandler
): String = {
def buildAttributeTree(obj: StructField): Attribute = {
obj.dataType match {
case StringType | LongType | IntegerType | ShortType | DoubleType | BooleanType | ByteType |
DateType | TimestampType =>
Attribute(obj.name, obj.dataType.typeName, required = !obj.nullable)
case _: DecimalType =>
Attribute(obj.name, "decimal", required = !obj.nullable)
// Arrays are mapped to an attribute of their element type
case ArrayType(eltType, _) => buildAttributeTree(obj.copy(dataType = eltType))
case x: StructType =>
new Attribute(
obj.name,
"struct",
required = !obj.nullable,
attributes = Some(x.fields.map(buildAttributeTree).toList)
)
case _ => throw new Exception(s"Unsupported type ${obj.dataType} for object $obj")
}
}
Schema(
schemaName,
Pattern.compile("ignore"),
buildAttributeTree(obj).attributes.getOrElse(Nil),
None,
None,
None,
None,
None
).mapping(None, domainName, schemaHandler)
}
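// Sketch (hypothetical Spark schema; schemaHandler is assumed to be in scope):
// wrapping a Catalyst struct in a StructField and calling Schema.mapping
// builds a transient Schema from its fields and returns the corresponding
// Elasticsearch mapping.
//
//   val struct = StructField(
//     "root",
//     StructType(Seq(StructField("id", StringType, nullable = false)))
//   )
//   val json = Schema.mapping("sales", "orders", struct, schemaHandler)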
}