/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.enceladus.standardization
import org.apache.spark.sql.{DataFrameReader, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.enceladus.common._
import za.co.absa.enceladus.dao.EnceladusDAO
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.standardization.config.StandardizationConfigParser
import za.co.absa.enceladus.utils.unicode.ParameterConversion._
import scala.collection.immutable.HashMap
/**
* Reads standardization properties from the configuration file
*/
class StandardizationPropertiesProvider {
private val log: Logger = LoggerFactory.getLogger(this.getClass)
private final val SparkCSVReaderMaxColumnsDefault: Int = 20480
/**
* Returns a Spark reader with all format-specific options applied.
* Options are provided by command line parameters.
*
* @param cmd Command line parameters containing format-specific options
* @param dataset A dataset definition
   * @param numberOfColumns (Optional) expected number of columns; allows reading CSV files with more
   *                        columns than Spark's default limit
* @return The updated dataframe reader
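   *
   * Illustrative usage (a sketch, assuming an implicit SparkSession and EnceladusDAO are in scope
   * and that `cmd`, `dataset` and `rawPath` were obtained elsewhere):
   * {{{
   * val reader = new StandardizationPropertiesProvider().getFormatSpecificReader(cmd, dataset)
   * val rawDataFrame = reader.load(rawPath)
   * }}}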
*/
def getFormatSpecificReader[T](cmd: StandardizationConfigParser[T], dataset: Dataset, numberOfColumns: Int = 0)
(implicit spark: SparkSession, dao: EnceladusDAO): DataFrameReader = {
val dfReader = spark.read.format(cmd.rawFormat)
    // Collecting format-specific options provided via command line parameters
val options = getCobolOptions(cmd, dataset) ++
getGenericOptions(cmd) ++
getXmlOptions(cmd) ++
getCsvOptions(cmd, numberOfColumns) ++
getFixedWidthOptions(cmd)
// Applying all the options
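    // Options mapped to None are skipped, so the reader keeps the data source's default for that key.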
options.foldLeft(dfReader) { (df, optionPair) =>
optionPair match {
case (key, Some(value)) =>
value match {
// Handle all .option() overloads
case StringParameter(s) => df.option(key, s)
case BooleanParameter(b) => df.option(key, b)
case LongParameter(l) => df.option(key, l)
case DoubleParameter(d) => df.option(key, d)
}
case (_, None) => df
}
}
}
private def getGenericOptions[T](cmd: StandardizationConfigParser[T]): HashMap[String, Option[RawFormatParameter]] = {
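    // Spark parse mode: FAILFAST aborts on the first record that does not conform to the schema,
    // PERMISSIVE keeps such records and sets the malformed fields to null.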
val mode = if (cmd.failOnInputNotPerSchema) {
"FAILFAST"
} else {
"PERMISSIVE"
}
HashMap(
"charset" -> cmd.charset.map(StringParameter),
"mode" -> Option(StringParameter(mode))
)
}
private def getXmlOptions[T](cmd: StandardizationConfigParser[T]): HashMap[String, Option[RawFormatParameter]] = {
if (cmd.rawFormat == "xml") {
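      // "rowtag" names the XML element that delimits a single record for the spark-xml data source.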
HashMap("rowtag" -> cmd.rowTag.map(StringParameter))
} else {
HashMap()
}
}
private def getCsvOptions[T](cmd: StandardizationConfigParser[T],
numberOfColumns: Int = 0): HashMap[String, Option[RawFormatParameter]] = {
    if (cmd.rawFormat == "csv") {
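      // Delimiter, quote and escape may be given as Unicode escape codes or the keyword "none";
      // judging by their names, includingUnicode/includingNone (from ParameterConversion) resolve
      // these into the actual characters before they are handed to the CSV reader.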
HashMap(
"delimiter" -> cmd.csvDelimiter.map(s => StringParameter(s.includingUnicode.includingNone)),
"header" -> cmd.csvHeader.map(BooleanParameter),
"quote" -> cmd.csvQuote.map(s => StringParameter(s.includingUnicode.includingNone)),
"escape" -> cmd.csvEscape.map(s => StringParameter(s.includingUnicode.includingNone)),
"ignoreLeadingWhiteSpace" -> cmd.csvIgnoreLeadingWhiteSpace.map(BooleanParameter),
"ignoreTrailingWhiteSpace" -> cmd.csvIgnoreTrailingWhiteSpace.map(BooleanParameter),
// increase the default limit on the number of columns if needed
// default is set at org.apache.spark.sql.execution.datasources.csv.CSVOptions maxColumns
"maxColumns" -> {
if (numberOfColumns > SparkCSVReaderMaxColumnsDefault) Some(LongParameter(numberOfColumns)) else None
},
"nullValue" -> cmd.nullValue.map(StringParameter)
)
} else {
HashMap()
}
}
private def getFixedWidthOptions[T](cmd: StandardizationConfigParser[T]): HashMap[String, Option[RawFormatParameter]] = {
if (cmd.rawFormat == "fixed-width") {
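      // Pass-through options for the fixed-width data source; only values the user actually supplied are set.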
HashMap(
"trimValues" -> cmd.fixedWidthTrimValues.map(BooleanParameter),
"treatEmptyValuesAsNulls" -> cmd.fixedWidthTreatEmptyValuesAsNulls.map(BooleanParameter),
"nullValue" -> cmd.nullValue.map(StringParameter),
"charset" -> cmd.charset.map(StringParameter)
)
} else {
HashMap()
}
}
private def getCobolOptions[T](cmd: StandardizationConfigParser[T], dataset: Dataset)
(implicit dao: EnceladusDAO): HashMap[String, Option[RawFormatParameter]] = {
    if (cmd.rawFormat == "cobol") {
val cobolOptions = cmd.cobolOptions.getOrElse(CobolOptions())
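      // If no COBOL-specific options were supplied on the command line, fall back to the CobolOptions defaults.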
val isAscii = cobolOptions.encoding.exists(_.equalsIgnoreCase("ascii"))
// isXcom = variable length file (V)
// isText = ASCII text file (D)
      // If neither is set = fixed length file (F)
// More info: https://www.ibm.com/docs/en/zos-basic-skills?topic=set-data-record-formats
val recordFormat = (cobolOptions.isXcom, cobolOptions.isText) match {
case (true, _) => "V"
case (_, true) => "D"
case _ => "F"
}
// For ASCII files --charset is converted into Cobrix "ascii_charset" option
// For EBCDIC files --charset is converted into Cobrix "ebcdic_code_page" option
HashMap(
getCopybookOption(cobolOptions, dataset),
"record_format" -> Option(StringParameter(recordFormat)),
"string_trimming_policy" -> cobolOptions.trimmingPolicy.map(StringParameter),
"encoding" -> cobolOptions.encoding.map(StringParameter),
"ascii_charset" -> cmd.charset.flatMap(charset => if (isAscii) Option(StringParameter(charset)) else None),
"ebcdic_code_page" -> cmd.charset.flatMap(charset => if (!isAscii) Option(StringParameter(charset)) else None)
)
} else {
HashMap()
}
}
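  /**
   * Resolves the copybook option: a copybook path supplied on the command line is passed through as
   * "copybook"; otherwise the copybook attached to the dataset's schema is fetched from the DAO and
   * passed inline as "copybook_contents".
   */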
private def getCopybookOption(opts: CobolOptions, dataset: Dataset)(implicit dao: EnceladusDAO): (String, Option[RawFormatParameter]) = {
val copybook = opts.copybook
if (copybook.isEmpty) {
log.info("Copybook location is not provided via command line - fetching the copybook attached to the schema...")
val copybookContents = dao.getSchemaAttachment(dataset.schemaName, dataset.schemaVersion)
log.info(s"Applying the following copybook:\n$copybookContents")
("copybook_contents", Option(StringParameter(copybookContents)))
} else {
      log.info(s"Using copybook at $copybook")
("copybook", Option(StringParameter(copybook)))
}
}
}