com.dimajix.spark.sql.DataFrameBuilder.scala

/*
 * Copyright (C) 2021 The Flowman Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dimajix.spark.sql

import scala.collection.JavaConverters._

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.types.StructType

import com.dimajix.spark.sql.catalyst.PlanUtils


object DataFrameBuilder {
    private val rowParserOptions = RowParser.Options()

    /**
     * Creates a DataFrame from a sequence of string array records. The string values are converted to the
     * appropriate data types as specified in the schema.
     *
     * @param sparkSession the SparkSession used to create the DataFrame
     * @param lines the records, one string array per row, with one entry per schema field
     * @param schema the target schema describing column names and data types
     * @return a DataFrame containing the parsed records
     */
    def ofStringValues(sparkSession: SparkSession, lines:Seq[Array[String]], schema:StructType) : DataFrame = {
        val reader = new RowParser(schema, rowParserOptions)
        val rows = lines.map(reader.parse)
        sparkSession.createDataFrame(rows.asJava, schema)
    }
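
    // Usage sketch (hypothetical values, assuming an active SparkSession `spark` and the types
    // StructField/StringType/IntegerType imported from org.apache.spark.sql.types):
    //   val schema = StructType(Seq(
    //       StructField("name", StringType),
    //       StructField("age", IntegerType)))
    //   val df = DataFrameBuilder.ofStringValues(spark,
    //       Seq(Array("Alice", "23"), Array("Bob", "42")), schema)
    //   // The RowParser converts "23" and "42" to integers, as dictated by the schema.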

    /**
     * Creates an empty [[DataFrame]] from a schema.
     *
     * @param sparkSession the SparkSession used to create the DataFrame
     * @param schema the schema of the empty DataFrame
     * @return an empty DataFrame with the given schema
     */
    def ofSchema(sparkSession: SparkSession, schema:StructType) : DataFrame = {
        val rdd = sparkSession.sparkContext.emptyRDD[Row]
        sparkSession.createDataFrame(rdd, schema)
    }
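
    // Usage sketch (assuming an active SparkSession `spark` and a StructType `schema`):
    //   val empty = DataFrameBuilder.ofSchema(spark, schema)
    //   empty.count()   // 0 rows, but empty.schema still matches `schema`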

    /**
     * Creates a DataFrame from a sequence of Spark [[Row]] objects and a Spark schema.
     *
     * @param sparkSession the SparkSession used to create the DataFrame
     * @param rows the rows making up the DataFrame
     * @param schema the schema matching the given rows
     * @return a DataFrame containing the given rows
     */
    def ofRows(sparkSession: SparkSession, rows:Seq[Row], schema:StructType): DataFrame = {
        sparkSession.createDataFrame(rows.asJava, schema)
    }
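
    // Usage sketch (assuming `spark` and the hypothetical two-column `schema` from above):
    //   val rows = Seq(Row("Alice", 23), Row("Bob", 42))
    //   val df = DataFrameBuilder.ofRows(spark, rows, schema)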

    /**
     * Creates a DataFrame from a Spark [[LogicalPlan]]. The plan is analyzed first, so any analysis error
     * is raised immediately.
     *
     * @param sparkSession the SparkSession used to create the DataFrame
     * @param logicalPlan the logical plan backing the DataFrame
     * @return a DataFrame wrapping the given logical plan
     */
    def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = {
        val qe = sparkSession.sessionState.executePlan(logicalPlan)
        qe.assertAnalyzed()
        new Dataset[Row](sparkSession, logicalPlan, RowEncoder(qe.analyzed.schema))
    }
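
    // Usage sketch: re-wrapping the logical plan of some existing DataFrame `someDf`
    // (queryExecution.logical is standard Spark API):
    //   val df = DataFrameBuilder.ofRows(spark, someDf.queryExecution.logical)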

    /**
     * Creates a DataFrame containing a single row for a given schema, populated either with NULL values or with
     * the fields' default values.
     *
     * @param sparkSession the SparkSession used to create the DataFrame
     * @param schema the schema of the single-row DataFrame
     * @return a DataFrame containing exactly one row
     */
    def singleRow(sparkSession: SparkSession, schema: StructType): DataFrame = {
        val logicalPlan = PlanUtils.singleRowPlan(schema)
        new Dataset[Row](sparkSession, logicalPlan, RowEncoder(schema))
    }
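
    // Usage sketch: a single-row frame is handy for evaluating expressions against a schema:
    //   val df = DataFrameBuilder.singleRow(spark, schema)
    //   df.count()   // 1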

    /**
     * Creates a DataFrame whose logical plan consists only of named attribute references for the given schema.
     * The resulting DataFrame carries the schema but no data, which makes it useful for schema-only analysis.
     *
     * @param sparkSession the SparkSession used to create the DataFrame
     * @param schema the schema providing the attribute names and types
     * @return a DataFrame exposing the named attributes of the schema
     */
    def namedAttributes(sparkSession: SparkSession, schema: StructType): DataFrame = {
        val logicalPlan = PlanUtils.namedAttributePlan(schema)
        new Dataset[Row](sparkSession, logicalPlan, RowEncoder(schema))
    }
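
    // Usage sketch (the exact plan shape depends on PlanUtils.namedAttributePlan):
    //   val df = DataFrameBuilder.namedAttributes(spark, schema)
    //   df.schema   // should mirror `schema`; the frame carries no data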
}