com.spotify.scio.extra.bigquery.ToTableSchema.scala

/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.extra.bigquery

import com.spotify.scio.extra.bigquery.AvroConverters.AvroConversionException

import com.google.api.services.bigquery.model.TableFieldSchema
import org.apache.avro.LogicalTypes._
import org.apache.avro.Schema.Type
import org.apache.avro.Schema.Type._
import org.apache.avro.{LogicalType, Schema}

import scala.jdk.CollectionConverters._

/**
 * Converts a [[org.apache.avro.Schema Schema]] object into a
 * [[com.google.api.services.bigquery.model.TableSchema TableSchema]]. All Avro primitive and
 * complex types are supported.
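 *
 * A minimal usage sketch (illustrative only; `SchemaBuilder` comes from Avro and is not imported
 * by this file):
 *
 * {{{
 * val avroSchema: Schema = SchemaBuilder
 *   .record("User")
 *   .fields()
 *   .requiredString("name")
 *   .requiredLong("age")
 *   .endRecord()
 *
 * // "name" maps to a REQUIRED STRING field, "age" to a REQUIRED INTEGER field.
 * val fields: List[TableFieldSchema] = getFieldSchemas(avroSchema)
 * }}}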
 */
private[bigquery] trait ToTableSchema {
  private lazy val avroToBQTypes: Map[Type, String] = Map(
    STRING -> "STRING",
    ENUM -> "STRING",
    BYTES -> "BYTES",
    INT -> "INTEGER",
    LONG -> "INTEGER",
    FLOAT -> "FLOAT",
    DOUBLE -> "FLOAT",
    BOOLEAN -> "BOOLEAN",
    RECORD -> "RECORD",
    FIXED -> "BYTES"
  )

  private lazy val supportedAvroTypes: Set[Type] =
    (avroToBQTypes.keys ++ Seq(UNION, ARRAY, RECORD, MAP)).toSet

  private[bigquery] def getFieldSchemas(avroSchema: Schema): List[TableFieldSchema] =
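    // Each Avro record field becomes a TableFieldSchema: the field name is copied, the Avro doc
    // string (when present) becomes the BigQuery column description, and the type/mode are filled
    // in recursively by setFieldType below.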
    avroSchema.getFields.asScala.map { field =>
      val tableField = new TableFieldSchema()
        .setName(field.name)

      Option(field.doc).foreach(tableField.setDescription)

      setFieldType(tableField, field.schema)
      tableField
    }.toList

  private def setFieldType(field: TableFieldSchema, schema: Schema): Unit = {
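    // Unsupported Avro types fail fast. For non-union types the mode defaults to REQUIRED when not
    // already set; the BigQuery type comes from the schema's logical type, if any, otherwise from
    // the primitive mapping in avroToBQTypes. Complex types are then handled in the match below.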
    val schemaType = schema.getType

    if (!supportedAvroTypes.contains(schemaType)) {
      throw AvroConversionException(s"Could not match type $schemaType")
    }

    if (schemaType != UNION && Option(field.getMode).isEmpty) {
      field.setMode("REQUIRED")
    }

    Option(schema.getLogicalType)
      .map(typeFromLogicalType)
      .orElse(avroToBQTypes.get(schemaType))
      .foreach(field.setType)

    schemaType match {
      case UNION =>
        setFieldDataTypeFromUnion(field, schema)
      case ARRAY =>
        setFieldDataTypeFromArray(field, schema)
      case RECORD =>
        field.setFields(getFieldSchemas(schema).asJava)
        ()
      case MAP =>
        setFieldTypeFromMap(field, schema)
      case _ =>
        ()
    }
  }

  private def setFieldDataTypeFromUnion(field: TableFieldSchema, schema: Schema): Unit = {
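    // Only unions of exactly two types where one branch is null are supported; the field becomes
    // NULLABLE and takes the type of the non-null branch. Illustrative sketch:
    //   ["null", "string"]        -> NULLABLE STRING   (accepted)
    //   ["string", "int"]         -> rejected, no null branch
    //   ["null", "string", "int"] -> rejected, more than two types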
    if (schema.getTypes.size != 2) {
      throw AvroConversionException("Union fields must contain exactly 2 types")
    }

    if (Option(field.getMode).contains("REPEATED")) {
      throw AvroConversionException("Array of unions is not supported")
    }

    if (schema.getTypes.asScala.count(_.getType == NULL) != 1) {
      throw AvroConversionException("Union field must include null type")
    }

    field.setMode("NULLABLE")

    schema.getTypes.asScala
      .find(_.getType != NULL)
      .foreach(fieldType => setFieldType(field, fieldType))

    ()
  }

  private def setFieldDataTypeFromArray(field: TableFieldSchema, schema: Schema): Unit = {
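    // Arrays become REPEATED fields of their element type; nested arrays are rejected because
    // BigQuery has no REPEATED-of-REPEATED. Illustrative sketch: {"type": "array", "items": "string"}
    // becomes a REPEATED STRING field.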
    if (Option(field.getMode).contains("REPEATED")) {
      throw AvroConversionException("Array of arrays not supported")
    }

    field.setMode("REPEATED")
    setFieldType(field, schema.getElementType)
  }

  private def setFieldTypeFromMap(field: TableFieldSchema, schema: Schema): Unit = {
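    // BigQuery has no map type, so an Avro map becomes a REPEATED RECORD of (key, value) pairs.
    // Illustrative sketch: {"type": "map", "values": "long"} becomes a REPEATED RECORD with
    // sub-fields key: STRING REQUIRED and value: INTEGER REQUIRED.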
    if (Option(field.getMode).contains("REPEATED")) {
      throw AvroConversionException("Array of maps not supported")
    }

    field.setMode("REPEATED")
    field.setType("RECORD")

    val keyField = new TableFieldSchema()
      .setName("key")
      .setType("STRING")
      .setMode("REQUIRED")
    val valueField = new TableFieldSchema().setName("value")
    setFieldType(valueField, schema.getValueType)

    field.setFields(List(keyField, valueField).asJava)
    ()
  }

  /**
   * Converts an Avro logical type to a BigQuery type using the mapping described at
   * https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#logical_types. The
   * Joda-Time library doesn't support microsecond-level precision, therefore time-micros maps to
   * 'INTEGER' instead of 'TIME'; for the same reason timestamp-micros maps to 'INTEGER' instead
   * of 'TIMESTAMP'.
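   *
   * For example (illustrative only), an Avro field declared as
   * {{{
   * {"type": "bytes", "logicalType": "decimal", "precision": 38, "scale": 9}
   * }}}
   * maps to the BigQuery type NUMERIC.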
   */
  private def typeFromLogicalType(logicalType: LogicalType): String = logicalType match {
    case _: Date            => "DATE"
    case _: TimeMillis      => "TIME"
    case _: TimeMicros      => "INTEGER"
    case _: TimestampMillis => "TIMESTAMP"
    case _: TimestampMicros => "INTEGER"
    case _: Decimal         => "NUMERIC"
    case _ => throw new IllegalStateException(s"Unknown Logical Type: [${logicalType.getName}]")
  }
}



