/*
* Copyright (C) 2019 The Flowman Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dimajix.flowman.spec.target

import com.fasterxml.jackson.annotation.JsonProperty
import org.apache.hadoop.fs.Path

import com.dimajix.common.No
import com.dimajix.common.Trilean
import com.dimajix.common.Yes

import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_OUTPUT_MODE
import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_PARALLELISM
import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_REBALANCE
import com.dimajix.flowman.execution.Context
import com.dimajix.flowman.execution.Execution
import com.dimajix.flowman.execution.ExecutionException
import com.dimajix.flowman.execution.MappingUtils
import com.dimajix.flowman.execution.OutputMode
import com.dimajix.flowman.execution.Phase
import com.dimajix.flowman.execution.VerificationFailedException
import com.dimajix.flowman.fs.HadoopUtils
import com.dimajix.flowman.model.BaseTarget
import com.dimajix.flowman.model.MappingOutputIdentifier
import com.dimajix.flowman.model.ResourceIdentifier
import com.dimajix.flowman.model.Target
import com.dimajix.flowman.model.TargetDigest
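
/**
 * Companion object providing a convenience constructor which falls back to the default output
 * mode, parallelism and rebalance settings from the Flowman configuration.
 */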
object FileTarget {
def apply(context: Context, mapping: MappingOutputIdentifier, location: Path, format: String, options: Map[String, String]) : FileTarget = {
val conf = context.flowmanConf
new FileTarget(
Target.Properties(context),
mapping,
location,
format,
options,
OutputMode.ofString(conf.getConf(DEFAULT_TARGET_OUTPUT_MODE)),
conf.getConf(DEFAULT_TARGET_PARALLELISM),
conf.getConf(DEFAULT_TARGET_REBALANCE)
)
}
}
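
/**
 * A target that writes the output of a mapping as files into a (directory) location, using one
 * of the file formats supported by Spark's DataFrameWriter (for example "csv", "json" or "parquet").
 */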
final case class FileTarget(
instanceProperties: Target.Properties,
mapping:MappingOutputIdentifier,
location:Path,
format:String,
options:Map[String,String],
mode:OutputMode,
parallelism: Int,
rebalance: Boolean = false
) extends BaseTarget {
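    // Qualify the location against its FileSystem, so that resource identifiers, digests and
    // log messages always refer to a fully qualified path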
private val qualifiedLocation = {
val conf = context.hadoopConf
val fs = location.getFileSystem(conf)
location.makeQualified(fs.getUri, fs.getWorkingDirectory)
}
    /**
     * Returns a digest identifying this target instance within the given execution phase
     *
     * @return
     */
override def digest(phase:Phase): TargetDigest = {
TargetDigest(
namespace.map(_.name).getOrElse(""),
project.map(_.name).getOrElse(""),
name,
phase,
Map("location" -> qualifiedLocation.toString)
)
}
/**
* Returns all phases which are implemented by this target in the execute method
* @return
*/
override def phases : Set[Phase] = Set(Phase.CREATE, Phase.BUILD, Phase.VERIFY, Phase.TRUNCATE, Phase.DESTROY)
/**
* Returns a list of physical resources produced by this target
*
* @return
*/
override def provides(phase: Phase): Set[ResourceIdentifier] = Set(
ResourceIdentifier.ofFile(qualifiedLocation)
)
/**
* Returns a list of physical resources required by this target
* @return
*/
override def requires(phase: Phase) : Set[ResourceIdentifier] = {
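        // Only the BUILD phase actually reads the mapping's inputs, so upstream resources
        // are only required in that phase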
phase match {
case Phase.BUILD => MappingUtils.requires(context, mapping.mapping)
case _ => Set()
}
}
    /**
     * Returns the state of the target, specifically of any artifacts produced. If this method returns [[Yes]],
     * then [[execute]] should update the output, such that the target is no longer 'dirty'.
     *
     * @param execution
     * @param phase
     * @return
     */
    override def dirty(execution: Execution, phase: Phase): Trilean = {
        val fs = qualifiedLocation.getFileSystem(execution.spark.sparkContext.hadoopConfiguration)
        phase match {
            case Phase.CREATE =>
                // Check exists() first, since getFileStatus() throws a FileNotFoundException
                // for missing paths
                !fs.exists(qualifiedLocation) || !fs.getFileStatus(qualifiedLocation).isDirectory
            case Phase.BUILD =>
                !HadoopUtils.isValidFileData(fs, qualifiedLocation)
            case Phase.VERIFY => Yes
            case Phase.TRUNCATE =>
                // Guard with exists(), since listStatus() also throws for missing paths
                fs.exists(qualifiedLocation) && fs.listStatus(qualifiedLocation).nonEmpty
            case Phase.DESTROY =>
                fs.exists(qualifiedLocation)
            case _ => No
        }
    }
    override def create(execution: Execution) : Unit = {
        require(execution != null)

        val fs = qualifiedLocation.getFileSystem(execution.spark.sparkContext.hadoopConfiguration)
        // Check exists() first, since getFileStatus() throws a FileNotFoundException for missing paths
        if (!fs.exists(qualifiedLocation) || !fs.getFileStatus(qualifiedLocation).isDirectory) {
            logger.info(s"Creating directory '$qualifiedLocation' for file relation '$identifier'")
            fs.mkdirs(qualifiedLocation)
        }
    }
    /**
     * Performs the build operation by instantiating the mapping output and writing it as files
     * into the target location.
     *
     * @param execution
     */
override def build(execution: Execution): Unit = {
require(execution != null)
logger.info(s"Writing mapping output '${mapping}' to directory '$qualifiedLocation'")
val map = context.getMapping(mapping.mapping)
val dfIn = execution.instantiate(map, mapping.output)
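        // Adjust the number of output partitions (and therefore files): repartition() performs a
        // full shuffle producing evenly sized partitions, while coalesce() only merges existing
        // partitions without a shuffle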
val table = {
if (parallelism <= 0)
dfIn
else if (rebalance)
dfIn.repartition(parallelism)
else
dfIn.coalesce(parallelism)
}
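        // countRecords() wraps the DataFrame so that the number of written records is tracked
        // as an execution metric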
val dfCount = countRecords(execution, table)
dfCount.write
.options(options)
.format(format)
.mode(mode.batchMode)
.save(qualifiedLocation.toString)
}
/**
* Performs a verification of the build step or possibly other checks.
*
* @param execution
*/
override def verify(execution: Execution) : Unit = {
require(execution != null)
val file = execution.fs.file(qualifiedLocation)
if (!file.exists()) {
val error = s"Verification of target '$identifier' failed - location '$qualifiedLocation' does not exist"
logger.error(error)
throw new VerificationFailedException(identifier, new ExecutionException(error))
}
}
    /**
     * Truncates the target by deleting all files and subdirectories within the target location,
     * while keeping the directory itself.
     *
     * @param execution
     */
    override def truncate(execution: Execution): Unit = {
        require(execution != null)

        val fs = qualifiedLocation.getFileSystem(execution.spark.sparkContext.hadoopConfiguration)
        // Guard with exists(), since getFileStatus() throws a FileNotFoundException for missing paths
        if (fs.exists(qualifiedLocation) && fs.getFileStatus(qualifiedLocation).isDirectory) {
            logger.info(s"Truncating directory '$qualifiedLocation' of file relation '$identifier'")
            val files = fs.listStatus(qualifiedLocation)
            files.foreach(file => fs.delete(file.getPath, true))
        }
    }
override def destroy(execution: Execution) : Unit = {
require(execution != null)
val fs = qualifiedLocation.getFileSystem(execution.spark.sparkContext.hadoopConfiguration)
if (fs.exists(qualifiedLocation)) {
logger.info(s"Deleting directory '$qualifiedLocation' of file relation '$identifier'")
fs.delete(qualifiedLocation, true)
}
}
}
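
/**
 * Jackson-deserialized YAML specification of a [[FileTarget]]. All properties may contain
 * expressions, which are evaluated against the current context during instantiation.
 */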
class FileTargetSpec extends TargetSpec {
@JsonProperty(value="mapping", required=true) private var mapping:String = _
@JsonProperty(value="location", required=true) private var location:String = _
@JsonProperty(value="format", required=false) private var format:String = "csv"
@JsonProperty(value="mode", required=false) private var mode:Option[String] = None
@JsonProperty(value="options", required=false) private var options:Map[String,String] = Map()
@JsonProperty(value="parallelism", required=false) private var parallelism:Option[String] = None
@JsonProperty(value="rebalance", required=false) private var rebalance:Option[String] = None
override def instantiate(context: Context, properties:Option[Target.Properties] = None): FileTarget = {
val conf = context.flowmanConf
FileTarget(
instanceProperties(context, properties),
MappingOutputIdentifier.parse(context.evaluate(mapping)),
new Path(context.evaluate(location)),
context.evaluate(format),
context.evaluate(options),
OutputMode.ofString(context.evaluate(mode).getOrElse(conf.getConf(DEFAULT_TARGET_OUTPUT_MODE))),
context.evaluate(parallelism).map(_.toInt).getOrElse(conf.getConf(DEFAULT_TARGET_PARALLELISM)),
context.evaluate(rebalance).map(_.toBoolean).getOrElse(conf.getConf(DEFAULT_TARGET_REBALANCE))
)
}
}
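
/*
 * A minimal, illustrative YAML snippet for declaring such a target in a Flowman project.
 * The mapping name and location below are hypothetical placeholders; the property names
 * follow the @JsonProperty annotations above:
 *
 *   targets:
 *     export_csv:
 *       kind: file
 *       mapping: enriched_transactions
 *       location: "$basedir/export/transactions"
 *       format: csv
 *       mode: overwrite
 *       options:
 *         header: "true"
 *       parallelism: 1
 */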