All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.dimajix.flowman.spec.dataset.RelationDataset.scala Maven / Gradle / Ivy

There is a newer version: 1.2.0-synapse3.3-spark3.3-hadoop3.3
Show newest version
/*
 * Copyright 2019 Kaya Kupferschmidt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dimajix.flowman.spec.dataset

import com.fasterxml.jackson.annotation.JsonProperty
import org.apache.spark.sql.DataFrame

import com.dimajix.common.Trilean
import com.dimajix.flowman.execution.Context
import com.dimajix.flowman.execution.Execution
import com.dimajix.flowman.execution.OutputMode
import com.dimajix.flowman.model.AbstractInstance
import com.dimajix.flowman.model.Dataset
import com.dimajix.flowman.model.Reference
import com.dimajix.flowman.model.Relation
import com.dimajix.flowman.model.RelationIdentifier
import com.dimajix.flowman.model.RelationReference
import com.dimajix.flowman.model.ResourceIdentifier
import com.dimajix.flowman.spec.relation.RelationReferenceSpec
import com.dimajix.flowman.types.SingleValue
import com.dimajix.flowman.types.StructType


object RelationDataset {
    def apply(context: Context, relation:RelationIdentifier, partition:Map[String,SingleValue]) : RelationDataset = {
        new RelationDataset(
            Dataset.Properties(context, "relation(" + relation.name + ")", "relation"),
            RelationReference(context,relation),
            partition
        )
    }
}
case class RelationDataset(
    instanceProperties: Dataset.Properties,
    relation: Reference[Relation],
    partition:Map[String,SingleValue]
) extends AbstractInstance with Dataset {
    /**
      * Returns a list of physical resources produced by writing to this dataset
      * @return
      */
    override def provides : Set[ResourceIdentifier] = {
        val instance = relation.value
        instance.provides ++ instance.resources(partition)
    }

    /**
     * Returns a list of physical resources required for reading from this dataset
     * @return
     */
    override def requires : Set[ResourceIdentifier] = {
        val instance = relation.value
        instance.provides ++ instance.requires ++ instance.resources(partition)
    }

    /**
      * Returns true if the data represented by this Dataset actually exists
      *
      * @param execution
      * @return
      */
    override def exists(execution: Execution): Trilean = {
        val instance = relation.value
        instance.loaded(execution, partition)
    }

    /**
      * Removes the data represented by this dataset, but leaves the underlying relation present
      *
      * @param execution
      */
    override def clean(execution: Execution): Unit = {
        val instance = relation.value
        instance.truncate(execution, partition)
    }

    /**
      * Reads data from the relation, possibly from specific partitions
      *
      * @param execution
      * @param schema - the schema to read. If none is specified, all available columns will be read
      * @return
      */
    override def read(execution: Execution): DataFrame = {
        val instance = relation.value
        instance.read(execution, partition)
    }

    /**
      * Writes data into the relation, possibly into a specific partition
      *
      * @param execution
      * @param df - dataframe to write
      */
    override def write(execution: Execution, df: DataFrame, mode: OutputMode): Unit = {
        val instance = relation.value
        // Remove partition columns
        val outDf = partition.keys.foldLeft(df)((df,col) => df.drop(col))
        instance.write(execution, outDf, partition, mode)
    }

    /**
      * Returns the schema as produced by this dataset. The schema will not include any partition columns
      *
      * @return
      */
    override def describe(execution:Execution) : Option[StructType] = {
        val schema = execution.describe(relation.value, partition)
        Some(schema)
    }
}


class RelationDatasetSpec extends DatasetSpec {
    @JsonProperty(value="relation", required = true) private var relation: RelationReferenceSpec = _
    @JsonProperty(value="partition", required = false) private var partition: Map[String, String] = Map()

    override def instantiate(context: Context): RelationDataset = {
        val rel = relation.instantiate(context)
        RelationDataset(
            instanceProperties(context, rel.identifier.toString),
            relation.instantiate(context),
            partition.map { case(n,p) => (n,SingleValue(context.evaluate(p))) }
        )
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy