/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.protobuf

import java.io.FileNotFoundException
import java.nio.file.{Files, NoSuchFileException, Paths}
import java.util.Collections

import scala.jdk.CollectionConverters._
import scala.util.control.NonFatal

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.Column
import org.apache.spark.sql.errors.CompilationErrors
import org.apache.spark.sql.functions.{fnWithOptions, lit}
// scalastyle:off: object.name
object functions {
// scalastyle:on: object.name
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value. The
 * Protobuf definition is provided through a Protobuf descriptor file.
*
* @param data
* the binary column.
* @param messageName
 *   the protobuf message name to look for in the descriptor file.
* @param descFilePath
* The Protobuf descriptor file. This file is usually created using `protoc` with
* `--descriptor_set_out` and `--include_imports` options.
 * @param options
 *   options to control how the Protobuf record is parsed.
* @since 3.5.0
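 *
 * A minimal usage sketch (the input DataFrame `df` with a binary column `payload`, the
 * message name, and the descriptor path below are hypothetical; the `"mode"` option is
 * assumed to be one of the supported parsing options):
 * {{{
 *   import org.apache.spark.sql.functions.col
 *   import org.apache.spark.sql.protobuf.functions.from_protobuf
 *
 *   // Fail the query on malformed records instead of returning nulls.
 *   val options = java.util.Collections.singletonMap("mode", "FAILFAST")
 *   val parsed = df.select(
 *     from_protobuf(col("payload"), "ExampleEvent", "/path/to/events.desc", options)
 *       .as("event"))
 * }}}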
*/
@Experimental
def from_protobuf(
data: Column,
messageName: String,
descFilePath: String,
options: java.util.Map[String, String]): Column = {
val binaryFileDescSet = readDescriptorFileContent(descFilePath)
from_protobuf(data, messageName, binaryFileDescSet, options)
}
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value. The
 * Protobuf definition is provided through a Protobuf `FileDescriptorSet`.
*
* @param data
* the binary column.
* @param messageName
 *   the protobuf message name to look for in the descriptor set.
* @param binaryFileDescriptorSet
* Serialized Protobuf descriptor (`FileDescriptorSet`). Typically contents of file created
* using `protoc` with `--descriptor_set_out` and `--include_imports` options.
 * @param options
 *   options to control how the Protobuf record is parsed.
* @since 3.5.0
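 *
 * A minimal sketch, assuming the descriptor set was generated with
 * `protoc --descriptor_set_out=events.desc --include_imports events.proto` (the file path,
 * message name, and DataFrame `df` are hypothetical):
 * {{{
 *   import java.nio.file.{Files, Paths}
 *   import org.apache.spark.sql.functions.col
 *
 *   // Load the serialized FileDescriptorSet once on the driver.
 *   val descriptorBytes = Files.readAllBytes(Paths.get("/path/to/events.desc"))
 *   val parsed = df.select(
 *     from_protobuf(col("payload"), "ExampleEvent", descriptorBytes,
 *       java.util.Collections.emptyMap[String, String]()).as("event"))
 * }}}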
*/
@Experimental
def from_protobuf(
data: Column,
messageName: String,
binaryFileDescriptorSet: Array[Byte],
options: java.util.Map[String, String]): Column = {
fnWithOptions(
"from_protobuf",
options.asScala.iterator,
data,
lit(messageName),
lit(binaryFileDescriptorSet))
}
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value. The
 * Protobuf definition is provided through a Protobuf descriptor file.
*
* @param data
* the binary column.
* @param messageName
 *   the protobuf message name to look for in the descriptor file.
* @param descFilePath
* The Protobuf descriptor file. This file is usually created using `protoc` with
* `--descriptor_set_out` and `--include_imports` options.
* @since 3.5.0
*/
@Experimental
def from_protobuf(data: Column, messageName: String, descFilePath: String): Column = {
from_protobuf(data, messageName, descFilePath, emptyOptions)
}
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * `messageClassName` points to the Protobuf Java class. The jar containing the Java class
 * should be shaded. Specifically, `com.google.protobuf.*` should be shaded to
 * `org.sparkproject.spark_protobuf.protobuf.*`.
 * https://github.com/rangadi/shaded-protobuf-classes is useful for creating a shaded jar
 * from Protobuf files.
*
* @param data
* the binary column.
 * @param messageClassName
 *   The full name of the Protobuf Java class, e.g. `com.example.protos.ExampleEvent`. The
 *   jar with these classes needs to be shaded as described above.
* @since 3.5.0
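 *
 * A minimal sketch, assuming `com.example.protos.ExampleEvent` was compiled from a `.proto`
 * file and packaged in a shaded jar on the classpath as described above (`df` and the class
 * name are hypothetical):
 * {{{
 *   val parsed = df.select(
 *     from_protobuf(col("payload"), "com.example.protos.ExampleEvent").as("event"))
 * }}}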
*/
@Experimental
def from_protobuf(data: Column, messageClassName: String): Column = {
Column.fn("from_protobuf", data, lit(messageClassName))
}
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value. The
 * Protobuf definition is provided through a Protobuf `FileDescriptorSet`.
*
* @param data
* the binary column.
* @param messageName
 *   the protobuf message name to look for in the descriptor set.
* @param binaryFileDescriptorSet
* Serialized Protobuf descriptor (`FileDescriptorSet`). Typically contents of file created
* using `protoc` with `--descriptor_set_out` and `--include_imports` options.
* @since 3.5.0
*/
@Experimental
def from_protobuf(
data: Column,
messageName: String,
binaryFileDescriptorSet: Array[Byte]): Column = {
from_protobuf(data, messageName, binaryFileDescriptorSet, emptyOptions)
}
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * `messageClassName` points to the Protobuf Java class. The jar containing the Java class
 * should be shaded. Specifically, `com.google.protobuf.*` should be shaded to
 * `org.sparkproject.spark_protobuf.protobuf.*`.
 * https://github.com/rangadi/shaded-protobuf-classes is useful for creating a shaded jar
 * from Protobuf files.
*
* @param data
* the binary column.
 * @param messageClassName
 *   The full name of the Protobuf Java class, e.g. `com.example.protos.ExampleEvent`. The
 *   jar with these classes needs to be shaded as described above.
 * @param options
 *   options to control how the Protobuf record is parsed.
* @since 3.5.0
*/
@Experimental
def from_protobuf(
data: Column,
messageClassName: String,
options: java.util.Map[String, String]): Column = {
fnWithOptions("from_protobuf", options.asScala.iterator, data, lit(messageClassName))
}
// BEGIN-EDGE
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value. The
 * Protobuf schema is read from the schema registry configured in `options`. The following
 * options are supported, and the first two are required:
 *   - `schema.registry.subject` : [Required] Subject for the schema in the registry.
 *   - `schema.registry.address` : [Required] Schema registry address.
 *   - `schema.registry.protobuf.name` : [Optional] Specifies the Protobuf name explicitly.
 *     This is useful when the schema in the registry contains multiple protobufs. By default,
 *     the first Protobuf defined in the schema is selected. This option allows selecting any
 *     other Protobuf included in the schema. E.g. consider a sample schema registered as
 *     below:
 * {{{
 *   package com.example.protos;
 *
 *   message Person {
 *     string name = 1;
 *     Location loc = 2;
 *   }
 *   message Location {
 *     string city = 1;
 *     int32 zipcode = 2;
 *   }
 * }}}
*
* If `schema.registry.protobuf.name` is not set, the first protobuf `Person` is used.
* If `schema.registry.protobuf.name` is set to `"Location"` (or its full name
* `"com.example.protos.Location"`), `Location` Protobuf is used. Similarly any imported
* Protobuf can also be specified.
*
* @param data
* the binary column.
* @param options
 *   Options including the required configuration for the schema registry, as described
 *   above.
* @since 3.5.0 (DBR 14.2)
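 *
 * A minimal sketch with hypothetical registry coordinates (`asJava` comes from
 * `scala.jdk.CollectionConverters._`):
 * {{{
 *   val registryOptions = Map(
 *     "schema.registry.subject" -> "person-events",
 *     "schema.registry.address" -> "https://registry.example.com",
 *     "schema.registry.protobuf.name" -> "Location").asJava
 *   val parsed = df.select(from_protobuf(col("payload"), registryOptions).as("loc"))
 * }}}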
*/
@Experimental
def from_protobuf(
data: Column,
options: java.util.Map[String, String]): Column = {
fnWithOptions(
"from_protobuf_schema_registry",
options.asScala.iterator,
data
)
}
// END-EDGE
/**
 * Converts a column into binary of Protobuf format. The Protobuf definition is provided
 * through a Protobuf descriptor file.
*
* @param data
* the data column.
* @param messageName
 *   the protobuf message name to look for in the descriptor file.
* @param descFilePath
* The Protobuf descriptor file. This file is usually created using `protoc` with
* `--descriptor_set_out` and `--include_imports` options.
* @since 3.5.0
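 *
 * A minimal sketch (the DataFrame `df` with a struct column `event`, the message name, and
 * the descriptor path are hypothetical):
 * {{{
 *   val serialized = df.select(
 *     to_protobuf(col("event"), "ExampleEvent", "/path/to/events.desc").as("payload"))
 * }}}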
*/
@Experimental
def to_protobuf(data: Column, messageName: String, descFilePath: String): Column = {
to_protobuf(data, messageName, descFilePath, emptyOptions)
}
/**
 * Converts a column into binary of Protobuf format. The Protobuf definition is provided
 * through a Protobuf `FileDescriptorSet`.
 *
 * @param data
 *   the data column.
 * @param messageName
 *   the protobuf message name to look for in the descriptor set.
* @param binaryFileDescriptorSet
* Serialized Protobuf descriptor (`FileDescriptorSet`). Typically contents of file created
* using `protoc` with `--descriptor_set_out` and `--include_imports` options.
*
* @since 3.5.0
*/
@Experimental
def to_protobuf(
data: Column,
messageName: String,
binaryFileDescriptorSet: Array[Byte]): Column = {
to_protobuf(data, messageName, binaryFileDescriptorSet, emptyOptions)
}
/**
 * Converts a column into binary of Protobuf format. The Protobuf definition is provided
 * through a Protobuf descriptor file.
*
* @param data
* the data column.
* @param messageName
 *   the protobuf message name to look for in the descriptor file.
* @param descFilePath
* The Protobuf descriptor file. This file is usually created using `protoc` with
* `--descriptor_set_out` and `--include_imports` options.
 * @param options
 *   options to control how the struct is converted into a Protobuf record.
* @since 3.5.0
*/
@Experimental
def to_protobuf(
data: Column,
messageName: String,
descFilePath: String,
options: java.util.Map[String, String]): Column = {
val binaryFileDescriptorSet = readDescriptorFileContent(descFilePath)
to_protobuf(data, messageName, binaryFileDescriptorSet, options)
}
/**
 * Converts a column into binary of Protobuf format. The Protobuf definition is provided
 * through a Protobuf `FileDescriptorSet`.
 *
 * @param data
 *   the data column.
 * @param messageName
 *   the protobuf message name to look for in the descriptor set.
 * @param binaryFileDescriptorSet
 *   Serialized Protobuf descriptor (`FileDescriptorSet`). Typically the contents of a file
 *   created using `protoc` with `--descriptor_set_out` and `--include_imports` options.
 * @param options
 *   options to control how the struct is converted into a Protobuf record.
* @since 3.5.0
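 *
 * A minimal sketch reusing descriptor bytes loaded once on the driver (names and paths are
 * hypothetical):
 * {{{
 *   import java.nio.file.{Files, Paths}
 *
 *   val descriptorBytes = Files.readAllBytes(Paths.get("/path/to/events.desc"))
 *   val serialized = df.select(
 *     to_protobuf(col("event"), "ExampleEvent", descriptorBytes,
 *       java.util.Collections.emptyMap[String, String]()).as("payload"))
 * }}}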
*/
@Experimental
def to_protobuf(
data: Column,
messageName: String,
binaryFileDescriptorSet: Array[Byte],
options: java.util.Map[String, String]): Column = {
fnWithOptions(
"to_protobuf",
options.asScala.iterator,
data,
lit(messageName),
lit(binaryFileDescriptorSet))
}
/**
 * Converts a column into binary of Protobuf format. `messageClassName` points to the
 * Protobuf Java class. The jar containing the Java class should be shaded. Specifically,
 * `com.google.protobuf.*` should be shaded to `org.sparkproject.spark_protobuf.protobuf.*`.
 * https://github.com/rangadi/shaded-protobuf-classes is useful for creating a shaded jar
 * from Protobuf files.
*
* @param data
* the data column.
 * @param messageClassName
 *   The full name of the Protobuf Java class, e.g. `com.example.protos.ExampleEvent`. The
 *   jar with these classes needs to be shaded as described above.
* @since 3.5.0
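 *
 * A minimal sketch, mirroring the `from_protobuf` example above (`df` with a struct column
 * `event` and the shaded class name are hypothetical):
 * {{{
 *   val serialized = df.select(
 *     to_protobuf(col("event"), "com.example.protos.ExampleEvent").as("payload"))
 * }}}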
*/
@Experimental
def to_protobuf(data: Column, messageClassName: String): Column = {
Column.fn("to_protobuf", data, lit(messageClassName))
}
/**
 * Converts a column into binary of Protobuf format. `messageClassName` points to the
 * Protobuf Java class. The jar containing the Java class should be shaded. Specifically,
 * `com.google.protobuf.*` should be shaded to `org.sparkproject.spark_protobuf.protobuf.*`.
 * https://github.com/rangadi/shaded-protobuf-classes is useful for creating a shaded jar
 * from Protobuf files.
*
* @param data
* the data column.
 * @param messageClassName
 *   The full name of the Protobuf Java class, e.g. `com.example.protos.ExampleEvent`. The
 *   jar with these classes needs to be shaded as described above.
 * @param options
 *   options to control how the struct is converted into a Protobuf record.
* @since 3.5.0
*/
@Experimental
def to_protobuf(
data: Column,
messageClassName: String,
options: java.util.Map[String, String]): Column = {
fnWithOptions("to_protobuf", options.asScala.iterator, data, lit(messageClassName))
}
// BEGIN-EDGE
/**
 * Converts a column into binary Protobuf. The schema of the column should match the Protobuf
 * schema. The Protobuf schema is read from the schema registry configured in `options`. See
 * the documentation for `from_protobuf()` for a description of the schema registry options.
 *
 * @param data
 *   the data column.
* @param options
 *   Options including the required configuration for the schema registry, as described in
 *   the documentation for `from_protobuf()`.
* @since 3.5.0 (DBR 14.2)
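 *
 * A minimal sketch with hypothetical registry coordinates, mirroring the `from_protobuf()`
 * schema registry example:
 * {{{
 *   val registryOptions = Map(
 *     "schema.registry.subject" -> "person-events",
 *     "schema.registry.address" -> "https://registry.example.com").asJava
 *   val serialized = df.select(to_protobuf(col("event"), registryOptions).as("payload"))
 * }}}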
*/
@Experimental
def to_protobuf(
data: Column,
options: java.util.Map[String, String]): Column = {
fnWithOptions(
"to_protobuf_schema_registry",
options.asScala.iterator,
data
)
}
// END-EDGE
private def emptyOptions: java.util.Map[String, String] = Collections.emptyMap[String, String]()
// This method is copied from org.apache.spark.sql.protobuf.util.ProtobufUtils
private def readDescriptorFileContent(filePath: String): Array[Byte] = {
try {
Files.readAllBytes(Paths.get(filePath))
} catch {
case ex: FileNotFoundException =>
throw CompilationErrors.cannotFindDescriptorFileError(filePath, ex)
case ex: NoSuchFileException =>
throw CompilationErrors.cannotFindDescriptorFileError(filePath, ex)
case NonFatal(ex) => throw CompilationErrors.descriptorParseError(ex)
}
}
}