All Downloads are FREE. Search and download functionalities are using the official Maven repository.

za.co.absa.enceladus.standardization.config.StandardizationConfigParser.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.enceladus.standardization.config

import scopt.{OParser, OParserBuilder}
import za.co.absa.enceladus.common.config.JobConfigParser
import za.co.absa.enceladus.standardization.CobolOptions

/**
 * Contract of a Standardization job configuration that can be filled in by the scopt parser built in
 * the companion object.
 *
 * The `withXxx` methods each take the value of one setting and return a configuration of type `R`; the
 * parser actions in the companion object call them to record command-line option values. The
 * parameterless members read the settings back and are used by the companion object's validation of
 * option/format combinations.
 *
 * @tparam R the concrete configuration type returned by the `withXxx` methods
 */
trait StandardizationConfigParser[R] extends JobConfigParser[R] {
  // --- Setters, called from the parser actions ---
  // Raw data format; the parser passes the value of -f/--raw-format lower-cased.
  def withRawFormat(value: String): R
  def withCharset(value: Option[String] = None): R
  def withNullValue(value: Option[String] = None): R
  def withRowTag(value: Option[String] = None): R
  def withCsvDelimiter(value: Option[String] = None): R
  // NOTE: unlike the other optional setters, the default argument here is Some(false), not None.
  def withCsvHeader(value: Option[Boolean] = Some(false)): R
  def withCsvQuote(value: Option[String] = None): R
  def withCsvEscape(value: Option[String] = None): R
  def withCsvIgnoreLeadingWhiteSpace(value: Option[Boolean] = None): R
  def withCsvIgnoreTrailingWhiteSpace(value: Option[Boolean] = None): R
  // All COBOL-related command-line options are gathered into a single CobolOptions value.
  def withCobolOptions(value: Option[CobolOptions] = None): R
  def withFixedWidthTrimValues(value: Option[Boolean] = None): R
  // Fed by the hidden --debug-set-raw-path option.
  def withRawPathOverride(value: Option[String]): R
  // Fed by the --strict-schema-check option.
  def withFailOnInputNotPerSchema(value: Boolean): R
  def withFixedWidthTreatEmptyValuesAsNulls(value: Option[Boolean] = None): R

  // --- Accessors, read by the validation checks in the companion object ---
  def rawFormat: String
  def charset: Option[String]
  def nullValue: Option[String]
  def rowTag: Option[String]
  def csvDelimiter: Option[String]
  def csvHeader: Option[Boolean]
  def csvQuote: Option[String]
  def csvEscape: Option[String]
  def csvIgnoreLeadingWhiteSpace: Option[Boolean]
  def csvIgnoreTrailingWhiteSpace: Option[Boolean]
  def cobolOptions: Option[CobolOptions]
  def fixedWidthTrimValues: Option[Boolean]
  def rawPathOverride: Option[String]
  def failOnInputNotPerSchema: Boolean
  def fixedWidthTreatEmptyValuesAsNulls: Option[Boolean]
}

object StandardizationConfigParser {

  // Display names of the raw formats, used when composing help texts and validation error messages.
  // They are not the values `rawFormat` is compared against; those are lower-case (e.g. "csv", "fixed-width").
  private val csvFormatName = "CSV"
  private val cobolFormatName = "COBOL"
  private val fixedWidthFormatName = "FixedWidth"
  private val xmlFormatName = "XML"
  private val jsonFormatName = "JSON"


  //scalastyle:off method.length the length is legit for parsing input parameters
  /**
   * Builds the scopt parser for the Standardization-specific command-line options.
   *
   * The action of every option stores the parsed value into the configuration through the matching
   * `withXxx` method. The final `checkConfig` step runs `checkConfigX`, which rejects options that were
   * given for a raw format they do not apply to.
   *
   * @tparam R the concrete configuration type being populated
   * @return the sequence of option parsers followed by the configuration check
   */
  def standardizationParser[R <: StandardizationConfigParser[R]]: OParser[_, R] = {
    val builder = OParser.builder[R]
    import builder._

    // Applies `update` to the COBOL options collected so far, starting from an all-defaults CobolOptions
    // when no COBOL option has been processed yet, and stores the result back into the configuration.
    // This lets the individual COBOL options arrive in any order without overwriting one another.
    def withUpdatedCobolOptions(config: R)(update: CobolOptions => CobolOptions): R = {
      val current = config.cobolOptions.getOrElse(CobolOptions())
      config.withCobolOptions(Some(update(current)))
    }

    OParser.sequence(
      // the format is lower-cased so the later checks can compare it against lower-case literals
      opt[String]('f', "raw-format").required()
        .action((value, config) => config.withRawFormat(value.toLowerCase()))
        .text("format of the raw data (csv, xml, parquet, fixed-width, etc.)"),

      opt[String]("charset").optional()
        .action((value, config) => config.withCharset(Some(value)))
        .text("use the specific charset (default is UTF-8)"),

      opt[String]("null-value").optional()
        .action((value, config) => config.withNullValue(Some(value)))
        .text(s"For $csvFormatName and $fixedWidthFormatName file format. Sets the representation of a null value. Default is empty string."), //scalastyle:ignore maxLineLength

      opt[String]("row-tag").optional()
        .action((value, config) => config.withRowTag(Some(value)))
        .text("use the specific row tag instead of 'ROW' for XML format"),

      opt[String]("delimiter").optional()
        .action((value, config) => config.withCsvDelimiter(Some(value)))
        .text("use the specific delimiter instead of ',' for CSV format"),

      opt[String]("csv-quote").optional()
        .action((value, config) => config.withCsvQuote(Some(value)))
        .text("use the specific quote character for creating CSV fields that may contain delimiter character(s) (default is '\"')"),

      opt[String]("csv-escape").optional()
        .action((value, config) => config.withCsvEscape(Some(value)))
        .text("use the specific escape character for CSV fields (default is '\\')"),

      opt[Boolean]("csv-ignore-leading-white-space").optional()
        .action((value, config) => config.withCsvIgnoreLeadingWhiteSpace(Some(value)))
        .text("ignore leading whitespaces for each column"),

      opt[Boolean]("csv-ignore-trailing-white-space").optional()
        .action((value, config) => config.withCsvIgnoreTrailingWhiteSpace(Some(value)))
        .text("ignore trailing whitespaces for each column"),

      // no need for validation for boolean since scopt itself will do
      opt[Boolean]("header").optional()
        .action((value, config) => config.withCsvHeader(Some(value)))
        .text("use the header option to consider CSV header"),

      opt[Boolean]("trimValues").optional()
        .action((value, config) => config.withFixedWidthTrimValues(Some(value)))
        .text("use --trimValues option to trim values in fixed width file"),

      opt[Boolean]("strict-schema-check").optional()
        .action((value, config) => config.withFailOnInputNotPerSchema(value))
        .text("use --strict-schema-check option to fail or proceed over rows not adhering to the schema (with error in errCol)"),

      opt[String]("copybook").optional()
        .action((value, config) => withUpdatedCobolOptions(config)(_.copy(copybook = value)))
        .text("Path to a copybook for COBOL data format"),

      opt[Boolean]("is-xcom").optional()
        .action((value, config) => withUpdatedCobolOptions(config)(_.copy(isXcom = value)))
        .text("Does a mainframe file in COBOL format contain XCOM record headers"),

      opt[Boolean]("cobol-is-text").optional()
        .action((value, config) => withUpdatedCobolOptions(config)(_.copy(isText = value)))
        .text("Specifies if the mainframe file is ASCII text file"),

      opt[String]("cobol-encoding").optional()
        .action((value, config) => withUpdatedCobolOptions(config)(_.copy(encoding = Option(value))))
        .text("Specify encoding of mainframe files (ascii or ebcdic)"),

      opt[String]("cobol-trimming-policy").optional()
        .action((value, config) => withUpdatedCobolOptions(config)(_.copy(trimmingPolicy = Option(value))))
        .text("Specify string trimming policy for mainframe files (none, left, right, both)"),

      // hidden: not shown in the usage text
      opt[String]("debug-set-raw-path").optional().hidden()
        .action((value, config) => config.withRawPathOverride(Some(value)))
        .text("override the path of the raw data (used internally for performance tests)"),

      opt[Boolean]("empty-values-as-nulls").optional()
        .action((value, config) => config.withFixedWidthTreatEmptyValuesAsNulls(Some(value)))
        .text("For FixedWidth file format. Treats empty values as null. Default is false"),

      checkConfig(checkConfigX(_, builder))
    )
  }
  //scalastyle:on method.length

  // Raw format values (lower-case, as compared against `rawFormat`) for which the --charset option is accepted.
  private val formatsSupportingCharset = List("xml", "csv", "json", "cobol", "fixed-width")

  /**
   * Composes the error message for an option that is valid for exactly one format.
   *
   * @param option the option as typed on the command line, e.g. `--row-tag`
   * @param format the display name of the only format supporting the option
   * @return the error message
   */
  private def unsupportedOptionError(option: String, format: String): String =
    unsupportedOptionError(option, Seq(format))

  /**
   * Composes the error message for an option that is valid only for the listed formats.
   *
   * The formats are joined with commas, with " and " before the last one, and the word "format" is
   * pluralized when more than one format is listed.
   *
   * @param option  the option as typed on the command line, e.g. `--null-value`
   * @param formats display names of the formats supporting the option
   * @return the error message, or an empty string when `formats` is empty
   */
  private def unsupportedOptionError(option: String, formats: Seq[String]): String = {
    def message(formatList: String, plural: String): String =
      s"The $option option is supported only for $formatList format$plural"

    if (formats.isEmpty) {
      ""
    } else if (formats.size == 1) {
      message(formats.head, "")
    } else {
      // "A, B and C": everything but the last joined by commas, then the last one after " and "
      message(formats.init.mkString(", ") + " and " + formats.last, "s")
    }
  }

  /**
   * Validates the use of the `--charset` option.
   *
   * @param config the parsed configuration
   * @tparam R     the concrete configuration type
   * @return a single-element list holding the error message when a charset is set while the raw format is
   *         not one of `formatsSupportingCharset`; an empty list otherwise
   */
  private def checkCharset[R <: StandardizationConfigParser[R]](config: R): List[String] = {
    if (formatsSupportingCharset.contains(config.rawFormat) || config.charset.isEmpty) {
      Nil
    } else {
      val supportingFormatNames = Seq(csvFormatName, jsonFormatName, xmlFormatName, cobolFormatName, fixedWidthFormatName)
      List(unsupportedOptionError("--charset", supportingFormatNames))
    }
  }

  /**
   * Validates the use of the XML-only `--row-tag` option.
   *
   * @param config the parsed configuration
   * @tparam R     the concrete configuration type
   * @return a single-element list holding the error message when a row tag is set while the raw format is
   *         not "xml"; an empty list otherwise
   */
  private def checkXMLFields[R <: StandardizationConfigParser[R]](config: R): List[String] = {
    config.rowTag match {
      case Some(_) if config.rawFormat != "xml" => List(unsupportedOptionError("--row-tag", xmlFormatName))
      case _                                    => Nil
    }
  }

  /**
   * Validates that CSV-only options are not used with another raw format.
   *
   * The option names in the messages are the ones the parser registers (`--csv-escape`, `--csv-quote`),
   * so the user is pointed at an option that actually exists.
   *
   * @param config the parsed configuration
   * @tparam R     the concrete configuration type
   * @return one error message per CSV-only option that is set while the raw format is not "csv";
   *         empty when the raw format is "csv" or none of the options is set
   */
  private def checkCSVFields[R <: StandardizationConfigParser[R]](config: R): Seq[String] = {
    if (config.rawFormat == "csv") {
      Seq.empty
    } else {
      // command-line name of each CSV-only option paired with whether it has been set
      val csvOnlyOptions: Seq[(String, Boolean)] = Seq(
        "--delimiter" -> config.csvDelimiter.isDefined,
        "--csv-escape" -> config.csvEscape.isDefined,
        "--header" -> config.csvHeader.isDefined,
        "--csv-quote" -> config.csvQuote.isDefined,
        "--csv-ignore-leading-white-space" -> config.csvIgnoreLeadingWhiteSpace.isDefined,
        "--csv-ignore-trailing-white-space" -> config.csvIgnoreTrailingWhiteSpace.isDefined
      )
      csvOnlyOptions.collect {
        case (optionName, true) => unsupportedOptionError(optionName, csvFormatName)
      }
    }
  }

  /**
   * Validates that COBOL-only options are not used with another raw format.
   *
   * The option names in the messages are the ones the parser registers (`--cobol-is-text`, not `--is-text`),
   * and `--cobol-trimming-policy` is validated like the other COBOL options.
   *
   * Boolean options are detected by their value being `true`, so an explicit `--is-xcom false` or
   * `--cobol-is-text false` is not reported.
   *
   * @param config the parsed configuration
   * @tparam R     the concrete configuration type
   * @return one error message per COBOL-only option in use while the raw format is not "cobol";
   *         empty when the raw format is "cobol" or no COBOL options are present
   */
  private def checkCobolFields[R <: StandardizationConfigParser[R]](config: R): Seq[String] = {
    def cobolFieldsThatShouldNotBePresent(cobolOptions: CobolOptions): List[String] = {
      // An ordered list of pairs (rather than a Map) keeps the order of the reported errors deterministic
      // regardless of how many options are checked.
      val optionsInUse: List[(String, Boolean)] = List(
        "--copybook" -> (cobolOptions.copybook != ""),
        "--cobol-encoding" -> cobolOptions.encoding.isDefined,
        "--is-xcom" -> cobolOptions.isXcom,
        "--cobol-is-text" -> cobolOptions.isText,
        "--cobol-trimming-policy" -> cobolOptions.trimmingPolicy.isDefined
      )
      optionsInUse.collect {
        case (optionName, true) => unsupportedOptionError(optionName, cobolFormatName)
      }
    }

    if (config.rawFormat == "cobol") {
      List.empty
    } else {
      config.cobolOptions.fold(List.empty[String])(cobolFieldsThatShouldNotBePresent)
    }
  }

  /**
   * Validates that fixed-width-only options are not used with another raw format.
   *
   * @param config the parsed configuration
   * @tparam R     the concrete configuration type
   * @return error messages for `--trimValues` and `--empty-values-as-nulls` (in that order) for each of them
   *         that is set while the raw format is not "fixed-width"; empty otherwise
   */
  private def checkFixedWidthFields[R <: StandardizationConfigParser[R]](config: R): Seq[String] = {
    if (config.rawFormat != "fixed-width") {
      val trimValuesError = config.fixedWidthTrimValues
        .map(_ => unsupportedOptionError("--trimValues", fixedWidthFormatName))
      val emptyValuesAsNullsError = config.fixedWidthTreatEmptyValuesAsNulls
        .map(_ => unsupportedOptionError("--empty-values-as-nulls", fixedWidthFormatName))

      trimValuesError.toList ++ emptyValuesAsNullsError.toList
    } else {
      Seq.empty
    }
  }

  /**
   * Validates the `--null-value` option, which applies to both the CSV and the fixed-width format.
   *
   * @param config the parsed configuration
   * @tparam R     the concrete configuration type
   * @return a single error message when a null value is set while the raw format is neither "csv" nor
   *         "fixed-width"; empty otherwise
   */
  private def checkCSVAndFixedWidthFields[R <: StandardizationConfigParser[R]](config: R): Seq[String] = {
    config.rawFormat match {
      case "csv" | "fixed-width" => List.empty
      case _ =>
        val supportingFormatNames = Seq(csvFormatName, fixedWidthFormatName)
        config.nullValue.toList.map(_ => unsupportedOptionError("--null-value", supportingFormatNames))
    }
  }

  /**
   * Runs all option/format consistency checks on the parsed configuration.
   *
   * @param config  the parsed configuration
   * @param builder the scopt builder supplying the `success` / `failure` results
   * @tparam R      the concrete configuration type
   * @return `builder.success` when no check reported an error; otherwise `builder.failure` carrying all
   *         error messages, one per line
   */
  private def checkConfigX[R <: StandardizationConfigParser[R]](config: R, builder: OParserBuilder[R]): Either[String, Unit] = {
    // the checks run in this order, which is also the order of the reported messages
    val problems: Seq[String] = Seq(
      checkCharset(config),
      checkXMLFields(config),
      checkCSVFields(config),
      checkCobolFields(config),
      checkFixedWidthFields(config),
      checkCSVAndFixedWidthFields(config)
    ).flatten

    problems match {
      case Seq() => builder.success
      case _     => builder.failure(problems.mkString("\n"))
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy