ai.starlake.schema.generator.XlsModel.scala

package ai.starlake.schema.generator

import org.apache.poi.ss.usermodel.{Cell, DataFormatter, Row, Sheet, Workbook}
import org.apache.poi.xssf.usermodel.XSSFSheet

import scala.jdk.CollectionConverters._

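/** Header definitions and Apache POI helpers shared by the Starlake Excel
  * readers and writers: each sheet stores a hidden row of technical keys
  * (row 0) above a bold row of human-readable labels (row 1), with data
  * starting on row 2.
  */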
trait XlsModel {

  val allDomainHeaders = List(
    "_name"        -> "Name",
    "_path"        -> "Directory",
    "_ack"         -> "Ack",
    "_description" -> "Description",
    "_rename"      -> "Rename",
    "_tags"        -> "Tags",
    "_dagRef"      -> "Dag reference",
    "_frequency"   -> "Job frenquency"
  )
  val allPolicyHeaders = List(
    "_name"        -> "Name",
    "_predicate"   -> "Predicate",
    "_grants"      -> "User Groups",
    "_description" -> "Description"
  )

  val allIamPolicyTagHeaders = List(
    "_policyTag" -> "Policy Tag",
    "_members"   -> "Members",
    "_role"      -> "Role"
  )

  val allSchemaHeaders = List(
    "_name"               -> "Name",
    "_pattern"            -> "Pattern",
    "_mode"               -> "FILE or STREAM",
    "_write"              -> "Write Mode\n(OVERWRITE, APPEND, ERROR_IF_EXISTS)",
    "_format"             -> "DSV, POSITION, XML, JSON",
    "_header"             -> "Hash header (true / false)",
    "_delimiter"          -> "Separator",
    "_delta_column"       -> "Timestamp column to use on merge",
    "_merge_keys"         -> "Merge columns",
    "_description"        -> "Description",
    "_encoding"           -> "File encoding (UTF-8 by default)",
    "_sampling"           -> "Sampling strategy(obsolete)",
    "_partitioning"       -> "partition columns",
    "_sink"               -> "Sink Type",
    "_clustering"         -> "Clustering columns",
    "_merge_query_filter" -> "Filter to use on merge",
    "_presql"             -> "Pre SQLs - ###",
    "_postsql"            -> "Post SQLs - ###",
    "_primary_key"        -> "Primary Key",
    "_tags"               -> "Tags",
    "_rename"             -> "Rename",
    "_long_name"          -> "Rename source table",
    "_policy"             -> "Access Policy",
    "_escape"             -> "Escaping Char",
    // new fields
    "_multiline"  -> "Multiline",
    "_array"      -> "Is JSON array",
    "_quote"      -> "Quote character",
    "_ignore"     -> "UDF to apply to ignore input lines",
    "_xml"        -> "XML Options",
    "_extensions" -> "Accepted extensions",
    "_options"    -> "Spark ingestion options",
    "_validator"  -> "Class validator",
    "_null"       -> "Null value",
    "_dagRef"     -> "Dag reference",
    "_frequency"  -> "Job frenquency"
  )

  val allAttributeHeaders = List(
    "_name"           -> "Name",
    "_rename"         -> "New Name",
    "_type"           -> "Semantic Type",
    "_required"       -> "Required(true / false)",
    "_privacy"        -> "Privacy (MD5, SHA1, Initials ...)",
    "_metric"         -> "Metric (CONTINUOUS, DISCRETE ...)",
    "_default"        -> "Default value",
    "_script"         -> "Script",
    "_description"    -> "Description",
    "_position_start" -> "Start Position",
    "_position_end"   -> "End Position",
    "_trim"           -> "Trim (LEFT, RIGHT,BOTH)",
    "_ignore"         -> "Ignore ?",
    "_foreign_key"    -> "Foreign Key",
    "_tags"           -> "Tags",
    "_policy"         -> "Access Policy"
  )

  val allSchemaJobHeaders = List(
    "_job"               -> "Job Name",
    "_domain"            -> "Domain",
    "_name"              -> "Name",
    "_source"            -> "Tables sources for job",
    "_write"             -> "Write Mode\n(OVERWRITE, APPEND, ERROR_IF_EXISTS)",
    "_frequency"         -> "Job frenquency",
    "_partition"         -> "Partition column",
    "_description"       -> "Description",
    "_policy"            -> "Access Policy",
    "_database"          -> "Database",
    "_clustering"        -> "Clustering columns",
    "_tags"              -> "Tags",
    "_presql"            -> "Pre SQLs - ###",
    "_postsql"           -> "Post SQLs - ###",
    "_sink"              -> "Sink type",
    "_sinkConnectionRef" -> "Sink connection name",
    "_options"           -> "Sink Options",
    "_format"            -> "FS Sink Format",
    "_extension"         -> "FS Sink Extension",
    "_coalesce"          -> "FS Sink Coalesce",
    "_connection"        -> "Run connection type",
    "_connectionRef"     -> "Run connection name",
    "_dagRef"            -> "Dag reference"
  )

  val allAttributeJobHeaders = List(
    "_name"        -> "Name",
    "_type"        -> "Semantic Type",
    "_required"    -> "Required(true / false)",
    "_script"      -> "Script",
    "_description" -> "Description"
  )

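  /** Returns the data rows of `sheet` together with a map from technical
    * header key to column index. When the first cell of the first row starts
    * with "_", the sheet uses the two-row header written by `fillHeaders`:
    * the map is read from row 0 and the first two rows are skipped. Otherwise
    * the columns are assumed to follow `allHeaders` order and only the label
    * row is skipped.
    */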
  protected def getColsOrder(
    sheet: Sheet,
    allHeaders: List[String]
  ): (Iterable[Row], Map[String, Int]) = {
    val scalaSheet = sheet.asScala
    // sheet.asScala never returns null; the intended guard is against an
    // empty sheet, which would make the call to head below throw.
    if (scalaSheet.isEmpty) {
      throw new IllegalArgumentException("Sheet is empty")
    }
    // The sheet carries a technical key row when the first cell of its first
    // row starts with "_"; a blank first cell is returned as null and means
    // the key row is absent.
    val hasSchema = Option(
      scalaSheet.head.getCell(0, Row.MissingCellPolicy.RETURN_BLANK_AS_NULL)
    ).exists(_.getStringCellValue.startsWith("_"))
    if (hasSchema) {
      val headersRow = scalaSheet.head
      val headerMap = headersRow
        .cellIterator()
        .asScala
        .zipWithIndex
        .map { case (headerCell, index) =>
          val header = headerCell.getStringCellValue
          (header, index)
        }
        .toMap
      (scalaSheet.drop(2), headerMap)
    } else {
      (scalaSheet.drop(1), allHeaders.zipWithIndex.toMap)
    }
  }

  object formatter {
    private val f = new DataFormatter()

    def formatCellValue(cell: Cell): Option[String] = {
      // remove all non-breaking spaces from the cell value to avoid parsing errors
      f.formatCellValue(cell).trim.replaceAll("\\u00A0", "") match {
        case v if v.isEmpty => None
        case v              => Some(v)
      }
    }

    def formatCellWithBlankValue(cell: Cell): Option[String] = {
      formatCellValue(cell) match {
        case Some(v) if v.toLowerCase == "blank" => Some("")
        case v                                   => v
      }
    }
  }
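
  // Note: the "blank" sentinel lets a spreadsheet author distinguish an
  // explicitly empty value (Some("")) from a missing cell (None).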

  def fillHeaders(workbook: Workbook, headers: List[(String, String)], sheet: XSSFSheet): Unit = {
    val font = workbook.createFont
    font.setFontHeightInPoints(14.toShort)
    font.setFontName("Calibri")
    font.setBold(true)
    val boldStyle = workbook.createCellStyle()
    boldStyle.setFont(font)
    val header = sheet.createRow(0)
    header.setHeight(0) // hide the technical key row from users
    headers.map { case (key, _) => key }.zipWithIndex.foreach { case (key, columnIndex) =>
      val cell = header.createCell(columnIndex)
      cell.setCellValue(key)
    }
    val labelHeader = sheet.createRow(1)
    headers.map { case (_, value) => value }.zipWithIndex.foreach { case (value, columnIndex) =>
      val cell = labelHeader.createCell(columnIndex)
      cell.setCellValue(value)
      cell.setCellStyle(boldStyle)
    }
  }

}
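
// A minimal usage sketch (not part of the original trait; the object and
// sheet names are illustrative): write the domain headers into a fresh
// workbook, then read the column order back. Assumes poi-ooxml is on the
// classpath.
object XlsModelSketch extends XlsModel {
  import org.apache.poi.xssf.usermodel.XSSFWorkbook

  def main(args: Array[String]): Unit = {
    val workbook = new XSSFWorkbook()
    val sheet    = workbook.createSheet("domains")
    fillHeaders(workbook, allDomainHeaders, sheet)
    // Row 0 now holds the hidden "_" keys, so getColsOrder detects the
    // schema row and maps each key to its column index.
    val (rows, colIndexByKey) = getColsOrder(sheet, allDomainHeaders.map(_._1))
    println(s"columns=$colIndexByKey dataRows=${rows.size}")
    workbook.close()
  }
}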