// io.prophecy.libs.fixedFormat.scala

/*
 * ====================================================================
 *
 * PROPHECY CONFIDENTIAL
 *
 * Prophecy Inc
 * All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Prophecy Inc, the intellectual and technical concepts contained
 * herein are proprietary to Prophecy Inc and may be covered by U.S. and Foreign Patents,
 * patents in process, and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Prophecy Inc.
 *
 * ====================================================================
 */
package io.prophecy.libs

import io.prophecy.abinitio.xfr.ast.{
  CustomBody,
  CustomFunctionDefinition,
  CustomTransform,
  OutAssignCustomStatement,
  SimpleVariableCustomTerm
}
import io.prophecy.abinitio.xfr.parse.CustomCompiler
import io.prophecy.libs.FixedFormatSchemaImplicits._
import io.prophecy.libs.utils.{BooleanExpressionEvaluator, getLengthFromArraySizeInfo}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, LocatedFileStatus, Path}
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.{DoubleType, _}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.unsafe.types.UTF8String
import org.slf4j.LoggerFactory
import play.api.libs.json.Json

import java.io.OutputStream
import java.math.{BigInteger, MathContext}
import java.nio.{ByteBuffer, ByteOrder}
import scala.collection.mutable
import scala.util.Try
import scala.util.control.NonFatal

object FixedFileFormatImplicits {}

trait FixedFileFormatImplicits {

  implicit class FixedFileFormatDataFrame(val dataFrame: DataFrame) {

    def writeFixedFile(
      schema:            FFSchemaRecord,
      path:              String,
      maxRecordsPerFile: Option[Int] = None,
      partitionBy:       Option[List[String]] = None,
      fileName:          Option[(List[String], String) ⇒ String] = None,
      mode:              SaveMode = SaveMode.ErrorIfExists
    ): Unit = {
      val fixedSchema = Json.stringify(Json.toJson(schema))

      if (maxRecordsPerFile.isEmpty && partitionBy.isEmpty && fileName.isEmpty) {
        dataFrame.write
          .mode(mode)
          .option("schema", fixedSchema)
          .format("io.prophecy.libs.FixedFileFormat")
          .save(path)
      } else {
        assert(
          partitionBy.isDefined && fileName.isDefined,
          "FixedFormat error: Both parameters 'partitionBy' and 'fileName' must be present to write to custom file names.'"
        )

        val dfForPartitioning = partitionBy.get.foldLeft(dataFrame) {
          case (df, column) ⇒ df.withColumn(s"_$column", col(column))
        }
        val pathTmp = path + "/tmp"

        val dfWrite = dfForPartitioning.write
        val dfWriteWithOptional = maxRecordsPerFile
          .map { maxRecordsPerFile ⇒
            dfWrite.option("maxRecordsPerFile", maxRecordsPerFile)
          }
          .getOrElse(dfWrite)
        dfWriteWithOptional
          .mode(mode)
          .option("schema", fixedSchema)
          .partitionBy(partitionBy.get.map("_" + _): _*)
          .format("io.prophecy.libs.FixedFileFormat")
          .save(pathTmp)

        val fileSystem                = FileSystem.get(dataFrame.sparkSession.sparkContext.hadoopConfiguration)
        val filesIterator             = fileSystem.listFiles(new Path(pathTmp), true)
        val partitionsFilePartCounter = mutable.Map[Set[String], Int]()

        val files = {
          var files = List[LocatedFileStatus]()
          while (filesIterator.hasNext) files = filesIterator.next() :: files

          files
        }

        files.foreach { file ⇒
          if (file.getPath.getName.endsWith(".bin")) {
            val pathFileSplit = file.getPath.toString.split('/')

            val filePartitions = partitionBy.get.zipWithIndex.map {
              case (_, idx) ⇒
                val pathPartPartition = pathFileSplit(pathFileSplit.length - 2 - idx)
                (pathPartPartition, pathPartPartition.split('=').tail.mkString("="))
            }

            val partitionsFilePart = filePartitions.map(_._1).toSet
            val sequenceNumber     = partitionsFilePartCounter.getOrElseUpdate(partitionsFilePart, -1) + 1
            partitionsFilePartCounter += partitionsFilePart → sequenceNumber

            val sequenceNumberFormatted = sequenceNumber.toString

            fileSystem.rename(
              file.getPath,
              new Path(path, fileName.get(filePartitions.map(_._2), sequenceNumberFormatted))
            )
          } else {
            fileSystem.rename(file.getPath, new Path(path, file.getPath.getName))
          }
        }

        fileSystem.delete(new Path(pathTmp), true)
      }
    }

  }

  implicit class FixedFileFormatSpark(val spark: SparkSession) {

    def readFixedFile(schema: FFSchemaRecord, path: String): DataFrame = {
      val fixedFormatDf = spark.read
        .option("schema", schema)
        .format("io.prophecy.libs.FixedFileFormat")
        .load(path)

      fixedFormatDf
    }

  }

}
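
// ---------------------------------------------------------------------------
// A minimal usage sketch for the implicits above (illustrative only, not part
// of the library). It assumes the implicits are brought into scope, e.g. via
// an object mixing in FixedFileFormatImplicits; the schema value, the paths
// and the file-naming function below are placeholders.
//
//   val spark: SparkSession      = ???  // an active Spark session
//   val df: DataFrame            = ???  // data to be written
//   val ffSchema: FFSchemaRecord = ???  // fixed-format schema definition
//
//   // plain write: one directory of .bin part files
//   df.writeFixedFile(ffSchema, "/data/out/plain", mode = SaveMode.Overwrite)
//
//   // partitioned write with custom file names derived from the partition values
//   df.writeFixedFile(
//     ffSchema,
//     "/data/out/partitioned",
//     maxRecordsPerFile = Some(100000),
//     partitionBy       = Some(List("country")),
//     fileName          = Some((parts: List[String], seq: String) ⇒ s"${parts.mkString("_")}_$seq.bin")
//   )
//
//   // read it back with the same schema
//   val restored = spark.readFixedFile(ffSchema, "/data/out/plain")
// ---------------------------------------------------------------------------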

class FixedFileFormat extends FileFormat with DataSourceRegister with Serializable {
  private val logger = LoggerFactory.getLogger(classOf[FixedFileFormat])

  final val BUFFER_SIZE = 4096

  override def shortName(): String = "fixedFormat"

  override def toString: String = "FixedFormat"

  override def inferSchema(
    sparkSession: SparkSession,
    options:      Map[String, String],
    files:        Seq[FileStatus]
  ): Option[StructType] =
    Some(Json.parse(options("schema")).as[FFSchemaRecord].toSpark)

  /**
    * Converts the input byte array into a zoned-decimal digit string for both the ascii and ebcdic charsets, and
    * also reports whether the byte array represents a negative number.
    *
    * More information at https://github.com/SimpleDataLabsInc/prophecy/issues/662
    *
    * @param bufferSlice raw bytes of the zoned-decimal field
    * @param recordType  record charset, either "ascii" or "ebcdic"
    * @return the digit string and a flag that is true when the value is negative
    */
  private def fetchNumberStringWithSign(bufferSlice: Array[Byte], recordType: String): (String, Boolean) = {
    val numberDigitsCharArray =
      new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(recordType)).trim.toCharArray
    val signByte = bufferSlice(bufferSlice.length - 1)
    val (lastDigit, isNegative): (Int, Boolean) = recordType match {
      case "ebcdic" ⇒
        if (signByte >= 0xc0.toByte && signByte <= 0xc9.toByte) (signByte - 0xc0.toByte, false)
        else if (signByte >= 0xd0.toByte && signByte <= 0xd9.toByte) (signByte - 0xd0.toByte, true)
        else (0.toByte,                                                                       false)
      case "ascii" ⇒
        if (signByte >= 0x30.toByte && signByte <= 0x39.toByte) (signByte - 0x30.toByte, false)
        else if (signByte >= 0x70.toByte && signByte <= 0x79.toByte) (signByte - 0x70.toByte, true)
        else (0.toByte,                                                                       false)
    }
    numberDigitsCharArray(numberDigitsCharArray.length - 1) = (0x30 + lastDigit).toByte.toChar
    (new String(numberDigitsCharArray), isNegative)
  }
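
  // Worked example (illustrative): in EBCDIC the digits '1' and '2' are encoded as 0xF1 0xF2,
  // and a trailing sign byte 0xD3 carries both the last digit (3) and a negative sign (zone 0xD).
  // So for
  //   fetchNumberStringWithSign(Array(0xF1, 0xF2, 0xD3).map(_.toByte), "ebcdic")
  // the method returns ("123", true). The ASCII encoding uses zone 0x3x for a positive and 0x7x
  // for a negative trailing digit, so Array(0x31, 0x32, 0x73).map(_.toByte) with recordType
  // "ascii" also yields ("123", true).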

  /**
    * Based on [[https://github.com/apache/spark/blob/branch-2.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala]]
    */
  override def buildReader(
    sparkSession:    SparkSession,
    dataSchema:      StructType,
    partitionSchema: StructType,
    requiredSchema:  StructType,
    filters:         Seq[org.apache.spark.sql.sources.Filter],
    options:         Map[String, String],
    hadoopConf:      Configuration
  ): PartitionedFile ⇒ Iterator[InternalRow] = { file: PartitionedFile ⇒
    val ffSchema       = Json.parse(options("schema")).as[FFSchemaRecord]
    val columnValueMap = mutable.Map[String, Any]()
    // If the record type is "packed", the first 3 bytes of each record are reserved
    // for metadata; the exact semantics of those bytes are not yet established.
    val length = (if (ffSchema.recordType == "packed") 3 else 0) + SchemaUtils.findColumnLengthSum(ffSchema)

    val requiredColumns = requiredSchema.fields.map(_.name.toLowerCase()).toSet
    val lookupColumns   = requiredSchema.fields.map(_.name.toLowerCase())
    val requiredSchemaIdMap = ffSchema.rows
      .map {
        case FFSimpleSchemaRow(name,           format, value) ⇒ name
        case FFCompoundSchemaRow(compound,     rows) ⇒ compound.name.get
        case FFConditionalSchemaRow(condition, schemaRow: FFCompoundSchemaRow) ⇒ schemaRow.compound.name.get
        case FFConditionalSchemaRow(condition, schemaRow: FFSimpleSchemaRow)   ⇒ schemaRow.name
      }
      .zipWithIndex
      .map { case (name, idx) ⇒ idx → lookupColumns.indexOf(name.toLowerCase()) }
      .toMap

    @transient val transientHadoopConf = new Configuration
    val inStream                       = CodecStreams.createInputStream(transientHadoopConf, new Path(file.filePath))

    // method to stream bytes
    def nextReadBuf(): Stream[Array[Byte]] = {
      val buffer = new Array[Byte](BUFFER_SIZE)
      // read into the buffer
      val in = inStream.read(buffer)
      // if reached the end of stream, announce it
      // else return newly copied bytes
      if (in == -1) Stream.empty
      else
        buffer.slice(0, in) #:: nextReadBuf
    }

    def decodeFile(
      rowIndex: Int,
      bytes:    Array[Byte],
      itr:      Iterator[Array[Byte]]
    ): Stream[GenericInternalRow] = {
      if (bytes.isEmpty && !itr.hasNext) Stream.empty
      else {
        val (row, slicedBytes) = {
          // get enough bytes if not already present and in doing so
          // calculate the current capacity (compare with fully possible
          // length)
          var nBytes: Array[Byte] = bytes
          while (nBytes.length < 3 * length && itr.hasNext)
            nBytes = nBytes ++ itr.next()
          if (nBytes.nonEmpty)
            generateRow(ffSchema.recordType,
                        ffSchema.rows,
                        requiredSchemaIdMap,
                        requiredColumns,
                        lookupColumns,
                        requiredSchema,
                        nBytes,
                        rowIndex,
                        columnValueMap,
                        ""
            )
          else
            (new GenericInternalRow(0), nBytes)
        }

        // is this exit condition correct? test last row values in detail.
        if (bytes.isEmpty && slicedBytes.isEmpty && !itr.hasNext)
          row #:: Stream.empty
//        else if (bytes.isEmpty && slicedBytes.isEmpty)
//          Stream.empty
        else
          row #:: decodeFile(rowIndex + 1, slicedBytes, itr)
      }
    }

    // decode all Rows
    val itr = nextReadBuf().toIterator
    decodeFile(0, if (itr.hasNext) itr.next() else Array.empty[Byte], itr).toIterator
  }

  private def generateRowForSimpleSchema(
    start:           Int,
    format:          FFDataFormat,
    columnName:      String,
    parentName:      String,
    columnValueMap:  mutable.Map[String, Any],
    buffer:          Array[Byte],
    requiredColumns: Set[String],
    row:             GenericInternalRow,
    idx:             Int,
    recordType:      String,
    rowIndex:        Int
  ): Int = {
    val (bufferSlice, columnLength) = SchemaUtils.readRow(start, buffer, format, columnValueMap)
    if (requiredColumns.contains(columnName.toLowerCase())) {
      format match {
        case x @ FFNumberArrayFormat(name, precision, scale, Some(arraySizeInfo), miscProperties) ⇒
          val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
          val innerSchema = FFNumberFormat(name,                      precision, scale, miscProperties)
          var startIndex  = start
          val finalValue = (0 until arrayLength).map { curInd ⇒
            val childRow = new GenericInternalRow(1)
            startIndex = generateRowForSimpleSchema(startIndex,
                                                    innerSchema,
                                                    columnName,
                                                    parentName,
                                                    columnValueMap,
                                                    buffer,
                                                    Set[String](columnName.toLowerCase()),
                                                    childRow,
                                                    0,
                                                    recordType,
                                                    rowIndex
            )
            childRow.values.head
          }
          row(idx) = ArrayData.toArrayData(finalValue)
          columnValueMap.put(parentName + columnName, finalValue)
        case x @ FFStringArrayFormat(name, precision, Some(arraySizeInfo)) ⇒
          val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
          val innerSchema = FFStringFormat(name,                      precision)
          var startIndex  = start
          val finalValue = (0 until arrayLength).map { curInd ⇒
            val childRow = new GenericInternalRow(1)
            startIndex = generateRowForSimpleSchema(startIndex,
                                                    innerSchema,
                                                    columnName,
                                                    parentName,
                                                    columnValueMap,
                                                    buffer,
                                                    Set[String](columnName.toLowerCase()),
                                                    childRow,
                                                    0,
                                                    recordType,
                                                    rowIndex
            )
            childRow.getUTF8String(0)
          }
          row(idx) = ArrayData.toArrayData(finalValue)
          columnValueMap.put(parentName + columnName, finalValue)
        case x @ FFNumberFormat(FFTypeName("IntegerType", delimiter), Some(precision), scale, miscProperties)
            if (miscProperties
              .getOrElse("unsigned", false)
              .asInstanceOf[Boolean] && precision < 4) || (!miscProperties
              .getOrElse("unsigned", false)
              .asInstanceOf[Boolean] && precision < 8) ⇒
          val order =
            if (miscProperties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
            else ByteOrder.BIG_ENDIAN
          val value =
            if (precision <= 1) bufferSlice.head.toInt
            else if (precision <= 2) ByteBuffer.wrap(bufferSlice).order(order).getShort.toInt
            else ByteBuffer.wrap(bufferSlice).order(order).getInt
          val finalValue =
            if (miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean] && value < 0)
              (value + Math.pow(2, precision * 8)).toInt
            else value
          row(idx) = finalValue
          columnValueMap.put(parentName + columnName, finalValue)
        // handling unsigned integers with precision 4 to 7 (returned as Long)
        case x @ FFNumberFormat(FFTypeName("IntegerType", delimiter), Some(precision), scale, miscProperties)
            if miscProperties
              .getOrElse("unsigned", false)
              .asInstanceOf[Boolean] && precision < 8 ⇒
          val order =
            if (miscProperties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
            else ByteOrder.BIG_ENDIAN
          val value =
            if (precision <= 1) bufferSlice.head.toLong
            else if (precision <= 2)
              ByteBuffer.wrap(bufferSlice).order(order).getShort.toLong
            else ByteBuffer.wrap(bufferSlice).order(order).getInt.toLong
          val finalValue =
            if (miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean] && value < 0)
              (value + Math.pow(2, precision * 8)).toLong
            else value
          row(idx) = finalValue
          columnValueMap.put(parentName + columnName, finalValue)
        case x @ FFNumberFormat(FFTypeName("LongType", delimiter), Some(precision), scale, miscProperties) ⇒
          val order =
            if (miscProperties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
            else ByteOrder.BIG_ENDIAN
          val value =
            if (precision <= 1) bufferSlice.head.toLong
            else if (precision <= 2) ByteBuffer.wrap(bufferSlice).order(order).getShort.toLong
            else if (precision <= 4) ByteBuffer.wrap(bufferSlice).order(order).getInt.toLong
            else ByteBuffer.wrap(bufferSlice).order(order).getLong
          val finalValue =
            if (miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean] && value < 0)
              (value + Math.pow(2, precision * 8)).toLong
            else value
          row(idx) = finalValue
          columnValueMap.put(parentName + columnName, finalValue)
        case x @ FFNumberFormat(_, Some(precision), scale, miscProperties) ⇒
          val schemaRecordType =
            if (miscProperties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else recordType
          if (x.miscProperties.contains("packed") && x.miscProperties("packed") == true) {
            val isUnsigned = miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean]
            val isStripped = miscProperties.getOrElse("stripped", false).asInstanceOf[Boolean]
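            // NOTE: the "&& false" below disables the EBCDIC re-encoding branch,
            // so bufferSlice is always used as-is when unpacking packed decimals.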
            val curBytes =
              if (schemaRecordType == "ebcdic" && false)
                new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
                  .getBytes()
              else bufferSlice
            val decimal = curBytes.zipWithIndex
              .map {
                case (byte, idx) ⇒
                  if (idx == bufferSlice.length - 1 && !isStripped) {
                    (((byte >> 4) & 0x0f), idx)
                  } else {
                    (((byte >> 4) & 0x0f) * 10 + (byte & 0x0f), idx)
                  }
              }
              .foldLeft(BigDecimal(0)) {
                case (sum, (number, idx)) ⇒
                  if (idx == bufferSlice.length - 1 && !isStripped) {
                    sum * 10 + number
                  } else {
                    sum * 100 + number
                  }
              } / Math.pow(10, scale.getOrElse(0).toDouble)

            val signedDecimal = if ((bufferSlice.last & 0x0f) == 13 && !isUnsigned) {
              -decimal
            } else {
              decimal
            }
            val finalValue = Decimal(signedDecimal, precision, scale.getOrElse(0))
            row(idx) = finalValue
            columnValueMap.put(parentName + columnName, finalValue)
          } else if (x.miscProperties.contains("zoned") && x.miscProperties("zoned") == true) {
            try {
              val (str, isNegative) = fetchNumberStringWithSign(bufferSlice, schemaRecordType)
              val strWithDot = scale
                .map(scale ⇒ s"${str.slice(0, str.length - scale)}.${str.slice(str.length - scale, str.length)}")
                .getOrElse(str)
              val finalValue =
                if (isNegative)
                  new Decimal().set(BigDecimal(StringUtils.stripStart(strWithDot, "0"))).unary_-
                else
                  new Decimal().set(BigDecimal(StringUtils.stripStart(strWithDot, "0")))
              row(idx) = finalValue
              columnValueMap.put(parentName + columnName, finalValue)
            } catch {
              case NonFatal(e) ⇒
                val stringValue =
                  new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
                logger.debug(
                  s"Value being saved '$stringValue' for column: '$columnName' is not of decimal type: ${e.getMessage}"
                )
                columnValueMap.put(parentName + columnName, null)
                row.setNullAt(idx)
            }
          } else {
            val value =
              try {
                val str =
                  new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType)).trim
                val strWithDot =
                  if (x.miscProperties.getOrElse("decimal_point", "Comma").toString == "Period") str
                  else
                    scale
                      .map(scale ⇒ s"${str.slice(0, str.length - scale)}.${str.slice(str.length - scale, str.length)}")
                      .getOrElse(str)

                new Decimal().set(BigDecimal(StringUtils.stripStart(strWithDot.trim, "0")))
              } catch {
                case NonFatal(e) ⇒
                  val stringValue =
                    new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
                  logger.debug(
                    s"Value being saved '$stringValue' for column: '$columnName' is not of decimal type: ${e.getMessage}"
                  )
                  new Decimal().set(0)
              }

            try {
              columnValueMap.put(parentName + columnName, value)
              row(idx) = value
            } catch {
              case NonFatal(e) ⇒
                val stringValue =
                  new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
                println(
                  s"Error produced when reading column '$columnName' and row '$rowIndex' with value: '$stringValue' as a number: '${e.getMessage}'"
                )
                columnValueMap.put(parentName + columnName, null)
                row.setNullAt(idx)
            }
          }
        case FFNumberFormat(FFTypeName("IntegerType", _), _, _, miscProperties) ⇒
          val schemaRecordType =
            if (miscProperties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else recordType
          try {
            val finalValue =
              new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType)).trim.toInt
            row(idx) = finalValue
            columnValueMap.put(parentName + columnName, finalValue)
          } catch {
            case e: NumberFormatException ⇒
              columnValueMap.put(parentName + columnName, null)
              row.setNullAt(idx)
          }
        case _ ⇒
          val miscProperties: Map[String, String] = format match {
            case FFStringFormat(name, precision, props) ⇒ props.getOrElse(Map())
            case FFDateFormat(name,   format,    miscProperties) ⇒
              miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
            case FFDateTimeFormat(name, format, miscProperties) ⇒
              miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
            case _ ⇒ Map()
          }
          val schemaRecordType =
            if (miscProperties.getOrElse("ebcdic", false).toString == "true") "ebcdic" else recordType
          val finalValue = UTF8String
            .fromString(new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType)))
          row(idx) = finalValue
          columnValueMap.put(parentName + columnName, finalValue)
      }
    }

    start + columnLength
  }

  private def generateRowForCompoundSchema(
    curRow:         FFSchemaRow,
    buffer:         Array[Byte],
    start:          Int,
    requiredSchema: StructType,
    innerRows:      Seq[FFSchemaRow],
    rowIndex:       Int,
    columnValueMap: mutable.Map[String, Any],
    parentName:     String,
    columnName:     String,
    recordType:     String
  ): (GenericInternalRow, Int) = {
    val columnLength = SchemaUtils.findColumnLengthSum(curRow)
    val bufferSlice  = buffer.slice(start, start + columnLength)
    val curRequiredSchema =
      requiredSchema.fields.filter(_.name == columnName).head.dataType.asInstanceOf[StructType]
    val curRequiredColumns = curRequiredSchema.fields.map(_.name.toLowerCase()).toSet
    val curLookupColumns   = curRequiredSchema.fields.map(_.name.toLowerCase())
    val curRequiredSchemaIdMap = innerRows
      .map {
        case FFSimpleSchemaRow(name,           format, value) ⇒ name
        case FFCompoundSchemaRow(compound,     rows) ⇒ compound.name.get
        case FFConditionalSchemaRow(condition, schemaRow: FFCompoundSchemaRow) ⇒ schemaRow.compound.name.get
        case FFConditionalSchemaRow(condition, schemaRow: FFSimpleSchemaRow)   ⇒ schemaRow.name
      }
      .zipWithIndex
      .map { case (name, idx) ⇒ idx → curLookupColumns.indexOf(name.toLowerCase()) }
      .toMap
    (generateRow(
       if (recordType == "packed") "ascii" else recordType,
       innerRows,
       curRequiredSchemaIdMap,
       curRequiredColumns,
       curLookupColumns,
       curRequiredSchema,
       bufferSlice,
       rowIndex,
       columnValueMap,
       parentName + columnName + "."
     )._1,
     columnLength + start
    )
  }

  private def generateRow(
    recordType:      String,
    schemaRows:      Seq[FFSchemaRow],
    idMap:           Map[Int, Int],
    requiredColumns: Set[String],
    lookupColumns:   Array[String],
    requiredSchema:  StructType,
    buffer:          Array[Byte],
    rowIndex:        Int,
    columnValueMap:  mutable.Map[String, Any],
    parentName:      String
  ): (GenericInternalRow, Array[Byte]) = {
    val row = new GenericInternalRow(lookupColumns.length)
    val consumed = schemaRows.zipWithIndex.foldLeft(if (recordType == "packed") 3 else 0) {
      case (start,
            (curRow @ FFConditionalSchemaRow(condition, innerRow @ FFSimpleSchemaRow(columnName, format, defaultValue)),
             _idx
            )
          ) ⇒
        val idx            = idMap(_idx)
        val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
        if (conditionValue) {
          generateRowForSimpleSchema(start,
                                     format,
                                     columnName,
                                     parentName,
                                     columnValueMap,
                                     buffer,
                                     requiredColumns,
                                     row,
                                     idx,
                                     recordType,
                                     rowIndex
          )
        } else {
          row.setNullAt(idx)
          start
        }
      case (start,
            (curRow @ FFConditionalSchemaRow(condition,
                                             innerRow @ FFCompoundSchemaRow(FFStructType(columnName), curRows)
             ),
             _idx
            )
          ) ⇒
        val idx            = idMap(_idx)
        val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
        if (conditionValue) {
          val (rowValue, nextIndex) = generateRowForCompoundSchema(curRow,
                                                                   buffer,
                                                                   start,
                                                                   requiredSchema,
                                                                   curRows,
                                                                   rowIndex,
                                                                   columnValueMap,
                                                                   parentName,
                                                                   columnName,
                                                                   recordType
          )

          row(idx) = rowValue
          nextIndex
        } else {
          row.setNullAt(idx)
          start
        }
      case (start, (curRow @ FFCompoundSchemaRow(FFStructType(columnName), innerRows), _idx)) ⇒
        val idx = idMap(_idx)
        val (rowValue, nextIndex) = generateRowForCompoundSchema(curRow,
                                                                 buffer,
                                                                 start,
                                                                 requiredSchema,
                                                                 innerRows,
                                                                 rowIndex,
                                                                 columnValueMap,
                                                                 parentName,
                                                                 columnName,
                                                                 recordType
        )
        row(idx) = rowValue
        nextIndex
      case (start,
            (curRow @ FFConditionalSchemaRow(
               condition,
               innerRow @ FFCompoundSchemaRow(FFStructArrayType(columnName, Some(arraySizeInfo)), innerRows)
             ),
             _idx
            )
          ) ⇒
        val idx            = idMap(_idx)
        val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
        if (conditionValue) {
          val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo,     columnValueMap)
          val innerRow    = FFCompoundSchemaRow(FFStructType(columnName), innerRows)
          val innerSchema = requiredSchema.fields
            .filter(_.name == columnName)
            .head
            .dataType
            .asInstanceOf[ArrayType]
            .elementType
            .asInstanceOf[StructType]
          var startIndex = start
          val finalValue = (0 until arrayLength).map { curInd ⇒
            val (rowValue, nextIndex) = generateRowForCompoundSchema(
              innerRow,
              buffer,
              startIndex,
              StructType(List(StructField(columnName, StructType(innerSchema)))),
              innerRows,
              rowIndex,
              columnValueMap,
              parentName,
              columnName,
              recordType
            )
            startIndex = nextIndex
            rowValue
          }
          row(idx) = ArrayData.toArrayData(finalValue)
          startIndex
        } else {
          row.setNullAt(idx)
          start
        }
      case (start,
            (curRow @ FFCompoundSchemaRow(FFStructArrayType(columnName, Some(arraySizeInfo)), innerRows), _idx)
          ) ⇒
        val idx         = idMap(_idx)
        val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo,     columnValueMap)
        val innerRow    = FFCompoundSchemaRow(FFStructType(columnName), innerRows)
        val innerSchema = requiredSchema.fields
          .filter(_.name == columnName)
          .head
          .dataType
          .asInstanceOf[ArrayType]
          .elementType
          .asInstanceOf[StructType]
        var startIndex = start
        val finalValue = (0 until arrayLength).map { curInd ⇒
          val (rowValue, nextIndex) = generateRowForCompoundSchema(
            innerRow,
            buffer,
            startIndex,
            StructType(List(StructField(columnName, StructType(innerSchema)))),
            innerRows,
            rowIndex,
            columnValueMap,
            parentName,
            columnName,
            recordType
          )
          startIndex = nextIndex
          rowValue
        }
        row(idx) = ArrayData.toArrayData(finalValue)
        startIndex
      case (start, (FFSimpleSchemaRow(columnName, format, defaultValue), _idx)) ⇒
        val idx = idMap(_idx)
        generateRowForSimpleSchema(start,
                                   format,
                                   columnName,
                                   parentName,
                                   columnValueMap,
                                   buffer,
                                   requiredColumns,
                                   row,
                                   idx,
                                   recordType,
                                   rowIndex
        )

    }

    // return the generated row AND the bytes left over after consuming it
    // XXX is slicing the most optimal?
    row → buffer.slice(consumed, buffer.length)
  }

  override def prepareWrite(
    sparkSession: SparkSession,
    job:          Job,
    options:      Map[String, String],
    dataSchema:   StructType
  ): OutputWriterFactory = {
    new OutputWriterFactory {
      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
        val ffSchema = Json.parse(options("schema")).as[FFSchemaRecord]
        new FixedFormatOutputWriter(path, context, ffSchema, dataSchema)
      }

      override def getFileExtension(context: TaskAttemptContext): String =
        ".bin" + CodecStreams.getCompressionExtension(context)
    }
  }
}

class FixedFormatOutputWriter(
  path:         String,
  context:      TaskAttemptContext,
  ffSchema:     FFSchemaRecord,
  fullDFSchema: StructType
) extends OutputWriter {
  private var outputStream: Option[OutputStream] = None
  private var schemaRows     = ffSchema.rows
  private var dfSchema       = fullDFSchema
  private var curParent      = ""
  private val columnValueMap = mutable.Map[String, Any]()

  override def write(row: InternalRow): Unit = {
    val os = outputStream.getOrElse {
      val newStream = CodecStreams.createOutputStream(context, new Path(path))
      outputStream = Some(newStream)
      newStream
    }

    def writeRows(rows: Seq[FFSchemaRow]): Unit = {
      rows.foreach {
        case FFSimpleSchemaRow(columnName, format, defaultValue) ⇒
          val columnNameWithParent = curParent + columnName
          val (dfField, dfIdx) = dfSchema.fields.zipWithIndex
            .find(_._1.name.toLowerCase() == columnName.toLowerCase())
            .getOrElse(
              throw new Exception(
                s"Missing column '$columnName' in the dataframe with the schema: '$dfSchema'"
              )
            )

          val length = SchemaUtils.fixedLength(format)
          val data = format match {
            case x @ FFStringArrayFormat(name, Some(precision), Some(arraySizeInfo)) ⇒
              val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
              val oldRows     = schemaRows
              val oldDFSchema = dfSchema
              schemaRows = FFSimpleSchemaRow(columnName,
                                             FFStringFormat(FFTypeName("StringType", None), Some(precision)),
                                             FFNoDefaultVal()
              ) :: Nil
              dfSchema =
                StructType(List(StructField(columnName, dfField.dataType.asInstanceOf[ArrayType].elementType, true)))
              val oldCurParent = curParent
              (0 until arrayLength).foreach { curId ⇒
                val childRow = new GenericInternalRow(1)
                childRow(0) = row.getArray(dfIdx).getUTF8String(curId) //.toSeq[String](StringType)(curId)
                write(childRow)
              }
              curParent = oldCurParent
              schemaRows = oldRows
              dfSchema = oldDFSchema
              Array.empty[Byte]
            case x @ FFNumberArrayFormat(name, Some(precision), scale, Some(arraySizeInfo), miscProperties) ⇒
              val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
              val oldRows     = schemaRows
              val oldDFSchema = dfSchema
              schemaRows = FFSimpleSchemaRow(columnName,
                                             FFNumberFormat(name, Some(precision), scale, miscProperties),
                                             FFNoDefaultVal()
              ) :: Nil
              dfSchema =
                StructType(List(StructField(columnName, dfField.dataType.asInstanceOf[ArrayType].elementType, true)))
              val oldCurParent = curParent
              (0 until arrayLength).foreach { curId ⇒
                val childRow = new GenericInternalRow(1)
                childRow(0) = if (name.name == "IntegerType") {
                  if (precision <= 1)
                    row.getArray(dfIdx).getByte(curId)
                  else if (
                    precision <= 2 || (precision <= 1 && miscProperties
                      .getOrElse("unsigned", false)
                      .asInstanceOf[Boolean])
                  ) row.getArray(dfIdx).getShort(curId)
                  else if (
                    precision <= 4 || (precision <= 2 && miscProperties
                      .getOrElse("unsigned", false)
                      .asInstanceOf[Boolean])
                  ) row.getArray(dfIdx).getInt(curId)
                  else if (
                    precision <= 8 || (precision <= 4 && miscProperties
                      .getOrElse("unsigned", false)
                      .asInstanceOf[Boolean])
                  ) row.getArray(dfIdx).getLong(curId)
                } else if (name.name == "LongType") row.getArray(dfIdx).getLong(curId)
                else row.getArray(dfIdx).getUTF8String(curId) //.toSeq[String](StringType)(curId)
                write(childRow)
              }
              curParent = oldCurParent
              schemaRows = oldRows
              dfSchema = oldDFSchema
              Array.empty[Byte]
            case x @ FFNumberFormat(FFTypeName("IntegerType", _), Some(precision), None, properties) ⇒
              val endian =
                if (properties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
                else ByteOrder.BIG_ENDIAN
              var byteBuffer = ByteBuffer.allocate(length.getOrElse(4)).order(endian)

              if (properties.getOrElse("unsigned", false).asInstanceOf[Boolean]) {
                if (length.get <= 1) {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.put(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toByte
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                    byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                  }
                } else if (length.get <= 2) {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putShort(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toShort
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                    byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                  }
                } else {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putInt(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toInt
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                    byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
                  }
                }
              } else {
                if (length.get <= 1) {
                  columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                  byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                } else if (length.get <= 2) {
                  columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                  byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                } else {
                  columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                  byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
                }
              }
              byteBuffer.array()
            case x @ FFNumberFormat(FFTypeName("LongType", _), Some(precision), None, properties) ⇒
              val endian =
                if (properties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
                else ByteOrder.BIG_ENDIAN
              var byteBuffer = ByteBuffer.allocate(length.getOrElse(8)).order(endian)

              if (properties.getOrElse("unsigned", false).asInstanceOf[Boolean]) {
                if (length.get <= 1) {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.put(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toByte
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                    byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                  }
                } else if (length.get <= 2) {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putShort(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toShort
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                    byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                  }
                } else if (length.get <= 4) {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putInt(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toInt
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                    byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
                  }
                } else {
                  if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putLong(
                      (row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toLong
                    )
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putLong(row.getLong(dfIdx))
                  }
                }
              } else {
                if (length.get <= 1) {
                  columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                  byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                } else if (length.get <= 2) {
                  columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                  byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                } else if (length.get <= 4) {
                  columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                  byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
                } else {
                  columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                  byteBuffer = byteBuffer.putLong(row.getLong(dfIdx))
                }
              }
              byteBuffer.array()
            case x @ FFNumberFormat(FFTypeName("DecimalType", _), Some(precision), scale, properties)
                if properties.getOrElse("packed", false).asInstanceOf[Boolean] ⇒
              val schemaRecordType =
                if (properties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else ffSchema.recordType
              val columnField = dfSchema.fields
                .find(_.name.toLowerCase == columnName.toLowerCase)
                .getOrElse(throw new Exception(s"Column '$columnName' not found in the DataFrame's schema"))

              val bigDecimal = columnField.dataType match {
                case DecimalType() ⇒
                  val decimal = columnField.dataType.asInstanceOf[DecimalType]
                  columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, precision, scale.getOrElse(0)))
                  row.getDecimal(dfIdx,                    precision,            scale.getOrElse(0)).toJavaBigDecimal
                case IntegerType ⇒
                  columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                  java.math.BigDecimal.valueOf(row.getInt(dfIdx))
                case LongType ⇒
                  columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                  java.math.BigDecimal.valueOf(row.getLong(dfIdx))
                case StringType ⇒
                  columnValueMap.put(columnNameWithParent, row.getString(dfIdx))
                  new java.math.BigDecimal(row.getString(dfIdx))
                case _ ⇒
                  columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
                  java.math.BigDecimal.valueOf(row.getDouble(dfIdx))
              }
              bigDecimalToPackedBytes(
                bigDecimal,
                precision,
                scale.getOrElse(0),
                schemaRecordType,
                properties.getOrElse("unsigned", false).asInstanceOf[Boolean],
                properties.getOrElse("stripped", false).asInstanceOf[Boolean]
              )
            case x @ FFNumberFormat(FFTypeName("DecimalType", _), Some(precision), Some(scale), properties) ⇒
              val schemaRecordType =
                if (properties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else ffSchema.recordType
              if (x.miscProperties.contains("zoned") && x.miscProperties("zoned") == true) {
                val chars =
                  if (!row.isNullAt(dfIdx)) {
                    columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, precision, scale))
                    row.getDecimal(dfIdx,                    precision,            scale).toString.toCharArray.filter(_ != '.')
                  } else Array[Char]()
                if (length.isEmpty) {
                  charArrayToFixedBytes(chars, length, schemaRecordType, FixedBytesFormat.NumberType)
                } else {
                  charArrayToZonedFixedBytes(chars, length.get, schemaRecordType, '0')
                }

              } else {
                val appendChar =
                  if (schemaRecordType == "ebcdic") '0' else ' '
                val chars =
                  if (!row.isNullAt(dfIdx)) {
                    val stringValue =
                      if (scale == 0) {
                        columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, precision, scale))
                        row.getDecimal(dfIdx,                    precision,            scale).toString()
                      } else if (precision <= 18) {
                        columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
                        Decimal(BigDecimal(row.getDouble(dfIdx), MathContext.UNLIMITED), precision, scale).toString
                      } else {
                        columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
                        row.getDouble(dfIdx).toString
                      }
                    if (x.miscProperties.getOrElse("decimal_point", "Comma").toString == "Comma")
                      stringValue.toCharArray.filter(_ != '.')
                    else stringValue.toCharArray
                  } else Array[Char]()

                charArrayToFixedBytes(chars, length, schemaRecordType, FixedBytesFormat.NumberType, appendChar)
              }
            case _ ⇒
              val (endian, isEbcdic) = format match {
                case FFNumberFormat(name, precision, scale, miscProperties) ⇒
                  val endian = miscProperties.getOrElse("endian", "big") match {
                    case "big" ⇒ ByteOrder.BIG_ENDIAN
                    case _     ⇒ ByteOrder.LITTLE_ENDIAN
                  }
                  (endian, miscProperties.getOrElse("ebcdic", false).asInstanceOf[Boolean])
                case _ ⇒ (ByteOrder.BIG_ENDIAN, false)
              }
              val schemaRecordType = if (isEbcdic) "ebcdic" else ffSchema.recordType
              dfField.dataType match {
                case DecimalType() ⇒
                  val decimal = dfField.dataType.asInstanceOf[DecimalType]
                  columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, decimal.precision, decimal.scale))
                  val chars = row.getDecimal(dfIdx, decimal.precision, decimal.scale).toString.toCharArray
                  charArrayToFixedBytes(chars, length, schemaRecordType)
                case LongType ⇒
                  var byteBuffer = ByteBuffer.allocate(length.getOrElse(8)).order(endian)
                  if (length.get <= 1) {
                    columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                    byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                  } else if (length.get <= 2) {
                    columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                    byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                  } else if (length.get <= 4) {
                    columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                    byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
                    byteBuffer = byteBuffer.putLong(row.getLong(dfIdx))
                  }
                  byteBuffer.array()
                case IntegerType ⇒
                  var byteBuffer = ByteBuffer.allocate(length.getOrElse(4)).order(endian)
                  if (length.get <= 1) {
                    columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                    byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                  } else if (length.get <= 2) {
                    columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                    byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
                    byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
                  }
                  byteBuffer.array()
                case ShortType ⇒
                  var byteBuffer = ByteBuffer.allocate(length.getOrElse(2)).order(endian)
                  if (length.get <= 1) {
                    columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
                    byteBuffer = byteBuffer.put(row.getByte(dfIdx))
                  } else {
                    columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
                    byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
                  }
                  byteBuffer.array()
                case DoubleType ⇒
                  columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
                  val chars = row.getDouble(dfIdx).toString.toCharArray
                  charArrayToFixedBytes(chars, length, schemaRecordType)
                // TODO we should add proper handling for all the potential data types here; if a data type is incorrect
                //      and we fall back to UTF8String, this can sometimes cause unrecoverable JVM segfaults, because
                //      the UTF8String type uses unsafe Java APIs!
                case _ ⇒
                  val miscProperties: Map[String, String] = format match {
                    case FFStringFormat(name, precision, props) ⇒ props.getOrElse(Map())
                    case FFDateFormat(name,   format,    miscProperties) ⇒
                      miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
                    case FFDateTimeFormat(name, format, miscProperties) ⇒
                      miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
                    case _ ⇒ Map()
                  }
                  val schemaRecordType =
                    if (miscProperties.getOrElse("ebcdic", false).toString == "true") "ebcdic" else ffSchema.recordType
                  try columnValueMap.put(columnNameWithParent, row.getString(dfIdx))
                  catch {
                    case NonFatal(_) ⇒ println("cell value not fetched")
                  }
                  val chars = if (!row.isNullAt(dfIdx)) Try(row.getString(dfIdx).toCharArray).getOrElse {
                    val fields = dfSchema.fields.zipWithIndex
                      .filter(_._2 != dfIdx)
                      .map(field ⇒
                        Try(s"Field ${field._1.name}: ${row.getString(field._2)};")
                          .getOrElse(s"Empty Field ${field._1.name};")
                      )
                      .mkString("\n")

                    throw new Exception(s"Parsing of the column '$columnName' failed. Other fields: \n$fields")
                  }
                  else Array[Char]()
                  charArrayToFixedBytes(chars, length, schemaRecordType)
              }
          }
          if (data.nonEmpty)
            os.write(data)
        case FFCompoundSchemaRow(FFUnionType(Some(unionName)), compoundRows) ⇒
          dfSchema.fields.zipWithIndex
            .find(_._1.name.toLowerCase() == unionName.toLowerCase())
            .map {
              case (_, idx) ⇒
                val unionValue = row.getString(idx)
                compoundRows
                  .find {
                    case FFCompoundSchemaRow(FFStructType(potentialStruct), _) ⇒
                      potentialStruct == unionValue
                  }
                  .map {
                    case FFCompoundSchemaRow(FFStructType(_), structRows) ⇒
                      writeRows(structRows)
                  }
                  .getOrElse(throw new Exception(s"Couldn't find a struct in the union with the value: '$unionValue'"))
            }
            .getOrElse(throw new Exception(s"Couldn't find a column '$unionName' referenced by a compound union type"))
        case FFCompoundSchemaRow(FFStructType(columnName), rows) ⇒
          val (dfField, dfIdx) = dfSchema.fields.zipWithIndex
            .find(_._1.name.toLowerCase() == columnName.toLowerCase())
            .getOrElse(
              throw new Exception(
                s"Missing column '$columnName' in the dataframe with the schema: '$dfSchema'"
              )
            )
          val oldRows     = schemaRows
          val oldDFSchema = dfSchema
          schemaRows = rows
          dfSchema = dfField.dataType.asInstanceOf[StructType]
          val oldCurParent = curParent
          curParent = curParent + columnName + "."
          write(row.getStruct(dfIdx, rows.length))
          curParent = oldCurParent
          schemaRows = oldRows
          dfSchema = oldDFSchema
        case FFCompoundSchemaRow(FFStructArrayType(columnName, Some(arraySizeInfo)), rows) ⇒
          val (dfField, dfIdx) = dfSchema.fields.zipWithIndex
            .find(_._1.name.toLowerCase() == columnName.toLowerCase())
            .getOrElse(
              throw new Exception(
                s"Missing column '$columnName' in the dataframe with the schema: '$dfSchema'"
              )
            )
          val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
          val oldRows     = schemaRows
          val oldDFSchema = dfSchema
          schemaRows = rows
          dfSchema = dfField.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType]
          val oldCurParent = curParent
          curParent = curParent + columnName + "."
          (0 until arrayLength).foreach { curId ⇒
            write(row.getArray(dfIdx).getStruct(curId, rows.length))
          }
          curParent = oldCurParent
          schemaRows = oldRows
          dfSchema = oldDFSchema
        case FFConditionalSchemaRow(condition, schemaRow) ⇒
          val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
          if (conditionValue)
            writeRows(schemaRow :: Nil)
      }
    }

    writeRows(schemaRows)
  }

  private def bigDecimalToPackedBytes(
    bigDecimal: java.math.BigDecimal,
    precision:  Int,
    scale:      Int,
    recordType: String,
    isUnsigned: Boolean,
    isStripped: Boolean
  ): Array[Byte] = {
    var number =
      new BigInteger(bigDecimal.unscaledValue().abs().toByteArray).toString
    val numberWithoutLast = if (isStripped) number else number.substring(0, number.length - 1)
    val evenNumber = if (numberWithoutLast.length % 2 == 0) {
      numberWithoutLast
    } else {
      "0" + numberWithoutLast
    }

    val prefixBytes =
      Array.fill[Byte](
        Math.floor((precision.toDouble - numberWithoutLast.length) / 2).toInt - (if (!isStripped) 1 else 0)
      )(0)
    val bytes = evenNumber.toCharArray
      .grouped(2)
      .map { group ⇒
        val (high, low) = (group(0).asDigit, group(1).asDigit)
        ((high.toByte << 4) | low.toByte).toByte
      }
      .toArray
    val signNibble =
      if (isUnsigned) 0xf
      else if (bigDecimal.compareTo(BigDecimal(0).bigDecimal) >= 0) 0xc // 0 cannot be represented as "-0", thus >=
      else 0xd
    val byteSuffix = (number.last.toString.toInt.toByte << 4 | signNibble).toByte

    // Length: 10, number: 1,          prefixBytes: 4, bytes: 0, byteSuffix: 1
    // Length: 10, number: 1234567890, prefixBytes: 0, bytes: 5, byteSuffix: 1
    // Length: 9,  number: 1,          prefixBytes: 4, bytes: 0, byteSuffix: 1
    // Length: 9,  number: 123456789,  prefixBytes: 0, bytes: 5, byteSuffix: 1
    var finalBytes = prefixBytes ++ bytes ++ (if (!isStripped) Array(byteSuffix) else Nil)

    val result =
      if (recordType == "ebcdic") new String(finalBytes).getBytes(FixedFormatHelper.recordTypeToCharset(recordType))
      else finalBytes
    val totalResultBytes = precision / 2 + precision % 2

    finalBytes = Array.fill[Byte](totalResultBytes - finalBytes.length)(0) ++ finalBytes
    finalBytes
//    prefixBytes ++ bytes ++ Array(byteSuffix)
  }
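
  // Worked example (illustrative): writing the decimal -1234.5 with precision 6 and scale 1
  // packs the unscaled digits 12345 two per byte and appends the sign nibble 0xD (negative),
  // producing the three bytes 0x12 0x34 0x5D. A positive value ends in the nibble 0xC and an
  // unsigned one in 0xF; with isStripped = true the trailing sign nibble is omitted entirely.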

  /**
    * Converts a decimal number, given as a character array, into its zoned-decimal representation for both the
    * ebcdic and ascii charsets.
    *
    * More information at https://github.com/SimpleDataLabsInc/prophecy/issues/662
    *
    * @param chars      decimal digits, optionally with a leading '-'
    * @param length     fixed length of the output field in bytes
    * @param recordType record charset, either "ascii" or "ebcdic"
    * @param padding    intended padding character (the current implementation pads with the charset's zero digit)
    * @return the zoned-decimal bytes of the given length
    */
  private def charArrayToZonedFixedBytes(
    chars:      Array[Char],
    length:     Int,
    recordType: String,
    padding:    Char
  ): Array[Byte] = {
    val baseHexaDecimalValue = recordType match {
      case "ebcdic" ⇒ 0xf0.toByte
      case "ascii"  ⇒ 0x30.toByte
    }
    val zonedDecimalBytes: Array[Byte] = Array.fill(length)(baseHexaDecimalValue)
    val inputCharLength = if (chars(0) == '-') chars.length - 1 else chars.length
    val newStartIndex   = Math.max(length - inputCharLength, 0)
    val oldStartIndex   = chars.length - inputCharLength
    for (
      (newIdx, oldIdx) ← (newStartIndex until length - 1)
        .zip(oldStartIndex until Math.min(chars.length - 1, length - 1))
    )
      zonedDecimalBytes(newIdx) = (baseHexaDecimalValue + 0x01 * (chars(oldIdx) - '0')).toByte
    zonedDecimalBytes(length - 1) = recordType match {
      case "ebcdic" ⇒
        if (chars(0) == '-') (0xd0 + 0x01 * (chars(chars.length - 1) - '0')).toByte
        else (0xc0 + 0x01 * (chars(chars.length - 1) - '0')).toByte
      case "ascii" ⇒
        if (chars(0) == '-') (0x70 + 0x01 * (chars(chars.length - 1) - '0')).toByte
        else (0x30 + 0x01 * (chars(chars.length - 1) - '0')).toByte
    }
    zonedDecimalBytes
  }
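
  // Worked example (illustrative): rendering the value -1234 into a 5-byte zoned field in ASCII
  // yields the bytes 0x30 0x31 0x32 0x33 0x74, i.e. "0123" followed by a final byte whose zone
  // 0x7 marks a negative sign over the last digit 4. The EBCDIC equivalent is
  // 0xF0 0xF1 0xF2 0xF3 0xD4 (zone 0xD for negative, 0xC for positive).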

  private object FixedBytesFormat extends Enumeration {
    type FixedBytesFormat = Value
    val StringType, NumberType = Value
  }

  import FixedBytesFormat._

  private def charArrayToFixedBytes(
    chars:          Array[Char],
    length:         Option[Int],
    recordType:     String,
    format:         FixedBytesFormat = FixedBytesFormat.StringType,
    numberTypeChar: Char = '0'
  ): Array[Byte] = {
    val charsPadded = length match {
      case Some(length) ⇒
        format match {
          case StringType ⇒
            val charsPadded = Array.fill(length)(' ')

            for (idx ← 0 until Math.min(chars.length, length))
              charsPadded(idx) = chars(idx)

            charsPadded
          case NumberType ⇒
            val charsPadded = Array.fill(length)(numberTypeChar)
            val startIndex  = Math.max(length - chars.length, 0)
            for ((newIdx, oldIdx) ← (startIndex until length).zip(0 until Math.min(chars.length, length))) {
              val character = chars(oldIdx)
              if (character == '-') {
                if (numberTypeChar == '0') {
                  charsPadded(newIdx) = numberTypeChar
                  charsPadded(0) = '-'
                } else {
                  charsPadded(newIdx) = '-'
                }
              } else {
                charsPadded(newIdx) = character
              }
            }

            charsPadded
        }
      case None ⇒
        val charsPadded = Array.fill(chars.length + 1)('\0')
        for (idx ← chars.indices)
          charsPadded(idx) = chars(idx)
        charsPadded
    }

    String.copyValueOf(charsPadded).getBytes(FixedFormatHelper.recordTypeToCharset(recordType))
  }
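
  // Worked example (illustrative): with format = NumberType and numberTypeChar = '0', the chars
  // "-42" in a 5-character field become "-0042" (digits right-aligned, zero-padded, the minus
  // sign moved to the front); with format = StringType, "AB" in a 5-character field becomes
  // "AB   " (left-aligned, space-padded). The padded characters are then encoded with the
  // record charset.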

  def close(): Unit   = outputStream.foreach(_.close())
  def path():  String = path
}

object FixedFormatHelper {
  def recordTypeToCharset(recordType: String): String = {
    recordType match {
      case "ebcdic" ⇒ "Cp1047"
      // ASCII works here as well for most characters, however ABI also allows encoding extended characters
      // that are outside the range of standard ASCII.
      // There are many extensions to ASCII; windows-1252 appears to be the one that ABI uses
      // (based on trial and error).
      case "ascii" | "utf8" ⇒ "windows-1252"
      case "packed"         ⇒ "windows-1252"
      case _                ⇒ throw new Exception(s"Unsupported record type '$recordType'")
    }
  }
}



