// io.prophecy.libs.fixedFormat.scala
/*
* ====================================================================
*
* PROPHECY CONFIDENTIAL
*
* Prophecy Inc
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Prophecy Inc, the intellectual and technical concepts contained
* herein are proprietary to Prophecy Inc and may be covered by U.S. and Foreign Patents,
* patents in process, and are protected by trade secret or copyright law.
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from Prophecy Inc.
*
* ====================================================================
*/
package io.prophecy.libs
import io.prophecy.abinitio.xfr.ast.{
CustomBody,
CustomFunctionDefinition,
CustomTransform,
OutAssignCustomStatement,
SimpleVariableCustomTerm
}
import io.prophecy.abinitio.xfr.parse.CustomCompiler
import io.prophecy.libs.FixedFormatSchemaImplicits._
import io.prophecy.libs.utils.{BooleanExpressionEvaluator, getLengthFromArraySizeInfo}
import org.apache.commons.lang.StringUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, LocatedFileStatus, Path}
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.{DoubleType, _}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.unsafe.types.UTF8String
import org.slf4j.LoggerFactory
import play.api.libs.json.Json
import java.io.OutputStream
import java.math.{BigInteger, MathContext}
import java.nio.{ByteBuffer, ByteOrder}
import scala.collection.mutable
import scala.util.Try
import scala.util.control.NonFatal
object FixedFileFormatImplicits {}
trait FixedFileFormatImplicits {
implicit class FixedFileFormatDataFrame(val dataFrame: DataFrame) {
def writeFixedFile(
schema: FFSchemaRecord,
path: String,
maxRecordsPerFile: Option[Int] = None,
partitionBy: Option[List[String]] = None,
fileName: Option[(List[String], String) ⇒ String] = None,
mode: SaveMode = SaveMode.ErrorIfExists
): Unit = {
val fixedSchema = Json.stringify(Json.toJson(schema))
if (maxRecordsPerFile.isEmpty && partitionBy.isEmpty && fileName.isEmpty) {
dataFrame.write
.mode(mode)
.option("schema", fixedSchema)
.format("io.prophecy.libs.FixedFileFormat")
.save(path)
} else {
assert(
partitionBy.isDefined && fileName.isDefined,
"FixedFormat error: Both parameters 'partitionBy' and 'fileName' must be present to write to custom file names.'"
)
val dfForPartitioning = partitionBy.get.foldLeft(dataFrame) {
case (df, column) ⇒ df.withColumn(s"_$column", col(column))
}
val pathTmp = path + "/tmp"
val dfWrite = dfForPartitioning.write
val dfWriteWithOptional = maxRecordsPerFile
.map { maxRecordsPerFile ⇒
dfWrite.option("maxRecordsPerFile", maxRecordsPerFile)
}
.getOrElse(dfWrite)
dfWriteWithOptional
.mode(mode)
.option("schema", fixedSchema)
.partitionBy(partitionBy.get.map("_" + _): _*)
.format("io.prophecy.libs.FixedFileFormat")
.save(pathTmp)
val fileSystem = FileSystem.get(dataFrame.sparkSession.sparkContext.hadoopConfiguration)
val filesIterator = fileSystem.listFiles(new Path(pathTmp), true)
val partitionsFilePartCounter = mutable.Map[Set[String], Int]()
val files = {
var files = List[LocatedFileStatus]()
while (filesIterator.hasNext) files = filesIterator.next() :: files
files
}
files.foreach { file ⇒
if (file.getPath.getName.endsWith(".bin")) {
val pathFileSplit = file.getPath.toString.split('/')
val filePartitions = partitionBy.get.zipWithIndex.map {
case (_, idx) ⇒
val pathPartPartition = pathFileSplit(pathFileSplit.length - 2 - idx)
(pathPartPartition, pathPartPartition.split('=').tail.mkString("="))
}
val partitionsFilePart = filePartitions.map(_._1).toSet
val sequenceNumber = partitionsFilePartCounter.getOrElseUpdate(partitionsFilePart, -1) + 1
partitionsFilePartCounter += partitionsFilePart → sequenceNumber
val sequenceNumberFormatted = sequenceNumber.toString
fileSystem.rename(
file.getPath,
new Path(path, fileName.get(filePartitions.map(_._2), sequenceNumberFormatted))
)
} else {
fileSystem.rename(file.getPath, new Path(path, file.getPath.getName))
}
}
fileSystem.delete(new Path(pathTmp), true)
}
}
}
implicit class FixedFileFormatSpark(val spark: SparkSession) {
def readFixedFile(schema: FFSchemaRecord, path: String): DataFrame = {
val fixedFormatDf = spark.read
.option("schema", schema)
.format("io.prophecy.libs.FixedFileFormat")
.load(path)
fixedFormatDf
}
}
}
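// Illustrative usage sketch (hypothetical values, not taken from the library docs): with these implicit classes
// in scope, for example by mixing in FixedFileFormatImplicits, reading and writing fixed-format files could look
// roughly like this, where `ffSchema` is an FFSchemaRecord describing the record layout and the paths are
// hypothetical:
//
//   val df = spark.readFixedFile(ffSchema, "dbfs:/input/records")
//   df.writeFixedFile(
//     ffSchema,
//     "dbfs:/output/records",
//     partitionBy = Some(List("country")),
//     fileName = Some((partitionValues, seq) ⇒ s"${partitionValues.mkString("_")}_$seq.bin")
//   )
//
// Note that the custom-file-name path requires both 'partitionBy' and 'fileName', as asserted in writeFixedFile.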
class FixedFileFormat extends FileFormat with DataSourceRegister with Serializable {
private val logger = LoggerFactory.getLogger(classOf[FixedFileFormat])
final val BUFFER_SIZE = 4096
override def shortName(): String = "fixedFormat"
override def toString: String = "FixedFormat"
override def inferSchema(
sparkSession: SparkSession,
options: Map[String, String],
files: Seq[FileStatus]
): Option[StructType] =
Some(Json.parse(options("schema")).as[FFSchemaRecord].toSpark)
/**
* Converts the input byte array into a zoned decimal digit string for both the ascii and ebcdic charsets. The
* method also reports whether the byte array represents a negative or a positive number.
*
* More information at https://github.com/SimpleDataLabsInc/prophecy/issues/662
*
* @param bufferSlice raw bytes of the zoned decimal field
* @param recordType  "ascii" or "ebcdic"
* @return the digit string together with a flag that is true when the value is negative
*/
private def fetchNumberStringWithSign(bufferSlice: Array[Byte], recordType: String): (String, Boolean) = {
val numberDigitsCharArray =
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(recordType)).trim.toCharArray
val signByte = bufferSlice(bufferSlice.length - 1)
val (lastDigit, isNegative): (Int, Boolean) = recordType match {
case "ebcdic" ⇒
if (signByte >= 0xc0.toByte && signByte <= 0xc9.toByte) (signByte - 0xc0.toByte, false)
else if (signByte >= 0xd0.toByte && signByte <= 0xd9.toByte) (signByte - 0xd0.toByte, true)
else (0.toByte, false)
case "ascii" ⇒
if (signByte >= 0x30.toByte && signByte <= 0x39.toByte) (signByte - 0x30.toByte, false)
else if (signByte >= 0x70.toByte && signByte <= 0x79.toByte) (signByte - 0x70.toByte, true)
else (0.toByte, false)
}
numberDigitsCharArray(numberDigitsCharArray.length - 1) = (0x30 + lastDigit).toByte.toChar
(new String(numberDigitsCharArray), isNegative)
}
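// Worked example (hypothetical values): for an ebcdic zoned field holding -125 the bytes are 0xF1 0xF2 0xD5
// (digits carry the 0xF0 zone, the sign byte carries the 0xD0 "negative" zone). The sign byte 0xD5 falls in the
// 0xD0..0xD9 range, so lastDigit = 5 and isNegative = true, and the method returns ("125", true). For ascii
// input, 0x30..0x39 in the last byte means positive, while 0x70..0x79 marks a negative value in this encoding.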
/**
* Based on [[https://github.com/apache/spark/blob/branch-2.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala]]
*/
override def buildReader(
sparkSession: SparkSession,
dataSchema: StructType,
partitionSchema: StructType,
requiredSchema: StructType,
filters: Seq[org.apache.spark.sql.sources.Filter],
options: Map[String, String],
hadoopConf: Configuration
): PartitionedFile ⇒ Iterator[InternalRow] = { file: PartitionedFile ⇒
val ffSchema = Json.parse(options("schema")).as[FFSchemaRecord]
val columnValueMap = mutable.Map[String, Any]()
// if the record type is packed, the first 3 bytes of each record are reserved for
// metadata; the exact semantics of this prefix are not yet established.
val length = (if (ffSchema.recordType == "packed") 3 else 0) + SchemaUtils.findColumnLengthSum(ffSchema)
val requiredColumns = requiredSchema.fields.map(_.name.toLowerCase()).toSet
val lookupColumns = requiredSchema.fields.map(_.name.toLowerCase())
val requiredSchemaIdMap = ffSchema.rows
.map {
case FFSimpleSchemaRow(name, format, value) ⇒ name
case FFCompoundSchemaRow(compound, rows) ⇒ compound.name.get
case FFConditionalSchemaRow(condition, schemaRow: FFCompoundSchemaRow) ⇒ schemaRow.compound.name.get
case FFConditionalSchemaRow(condition, schemaRow: FFSimpleSchemaRow) ⇒ schemaRow.name
}
.zipWithIndex
.map { case (name, idx) ⇒ idx → lookupColumns.indexOf(name.toLowerCase()) }
.toMap
@transient val transientHadoopConf = new Configuration
val inStream = CodecStreams.createInputStream(transientHadoopConf, new Path(file.filePath))
// method to stream bytes
def nextReadBuf(): Stream[Array[Byte]] = {
val buffer = new Array[Byte](BUFFER_SIZE)
// read into the buffer
val in = inStream.read(buffer)
// if reached the end of stream, announce it
// else return newly copied bytes
if (in == -1) Stream.empty
else
buffer.slice(0, in) #:: nextReadBuf
}
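// Because Stream memoizes its evaluated cells, each BUFFER_SIZE chunk is read from `inStream` at most once;
// decodeFile below walks the chunks through an iterator and stitches together records that straddle chunk
// boundaries.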
def decodeFile(
rowIndex: Int,
bytes: Array[Byte],
itr: Iterator[Array[Byte]]
): Stream[GenericInternalRow] = {
if (bytes.isEmpty && !itr.hasNext) Stream.empty
else {
val (row, slicedBytes) = {
// accumulate enough bytes for the next record if they are not already
// present (buffer up to 3x the record length before decoding)
var nBytes: Array[Byte] = bytes
while (nBytes.length < 3 * length && itr.hasNext)
nBytes = nBytes ++ itr.next()
if (nBytes.nonEmpty)
generateRow(ffSchema.recordType,
ffSchema.rows,
requiredSchemaIdMap,
requiredColumns,
lookupColumns,
requiredSchema,
nBytes,
rowIndex,
columnValueMap,
""
)
else
(new GenericInternalRow(0), nBytes)
}
// is this exit condition correct? test last row values in detail.
if (bytes.isEmpty && slicedBytes.isEmpty && !itr.hasNext)
row #:: Stream.empty
// else if (bytes.isEmpty && slicedBytes.isEmpty)
// Stream.empty
else
row #:: decodeFile(rowIndex + 1, slicedBytes, itr)
}
}
// decode all Rows
val itr = nextReadBuf().toIterator
decodeFile(0, if (itr.hasNext) itr.next() else Array.empty[Byte], itr).toIterator
}
private def generateRowForSimpleSchema(
start: Int,
format: FFDataFormat,
columnName: String,
parentName: String,
columnValueMap: mutable.Map[String, Any],
buffer: Array[Byte],
requiredColumns: Set[String],
row: GenericInternalRow,
idx: Int,
recordType: String,
rowIndex: Int
): Int = {
val (bufferSlice, columnLength) = SchemaUtils.readRow(start, buffer, format, columnValueMap)
if (requiredColumns.contains(columnName.toLowerCase())) {
format match {
case x @ FFNumberArrayFormat(name, precision, scale, Some(arraySizeInfo), miscProperties) ⇒
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val innerSchema = FFNumberFormat(name, precision, scale, miscProperties)
var startIndex = start
val finalValue = (0 until arrayLength).map { curInd ⇒
val childRow = new GenericInternalRow(1)
startIndex = generateRowForSimpleSchema(startIndex,
innerSchema,
columnName,
parentName,
columnValueMap,
buffer,
Set[String](columnName.toLowerCase()),
childRow,
0,
recordType,
rowIndex
)
childRow.values.head
}
row(idx) = ArrayData.toArrayData(finalValue)
columnValueMap.put(parentName + columnName, finalValue)
case x @ FFStringArrayFormat(name, precision, Some(arraySizeInfo)) ⇒
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val innerSchema = FFStringFormat(name, precision)
var startIndex = start
val finalValue = (0 until arrayLength).map { curInd ⇒
val childRow = new GenericInternalRow(1)
startIndex = generateRowForSimpleSchema(startIndex,
innerSchema,
columnName,
parentName,
columnValueMap,
buffer,
Set[String](columnName.toLowerCase()),
childRow,
0,
recordType,
rowIndex
)
childRow.getUTF8String(0)
}
row(idx) = ArrayData.toArrayData(finalValue)
columnValueMap.put(parentName + columnName, finalValue)
case x @ FFNumberFormat(FFTypeName("IntegerType", delimiter), Some(precision), scale, miscProperties)
if (miscProperties
.getOrElse("unsigned", false)
.asInstanceOf[Boolean] && precision < 4) || (!miscProperties
.getOrElse("unsigned", false)
.asInstanceOf[Boolean] && precision < 8) ⇒
val order =
if (miscProperties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
else ByteOrder.BIG_ENDIAN
val value =
if (precision <= 1) bufferSlice.head.toInt
else if (precision <= 2) ByteBuffer.wrap(bufferSlice).order(order).getShort.toInt
else ByteBuffer.wrap(bufferSlice).order(order).getInt
val finalValue =
if (miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean] && value < 0)
(value + Math.pow(2, precision * 8)).toInt
else value
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
// handling unsigned integers with length 4
case x @ FFNumberFormat(FFTypeName("IntegerType", delimiter), Some(precision), scale, miscProperties)
if miscProperties
.getOrElse("unsigned", false)
.asInstanceOf[Boolean] && precision < 8 ⇒
val order =
if (miscProperties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
else ByteOrder.BIG_ENDIAN
val value =
if (precision <= 1) bufferSlice.head.toLong
else if (precision <= 2)
ByteBuffer.wrap(bufferSlice).order(order).getShort.toLong
else ByteBuffer.wrap(bufferSlice).order(order).getInt.toLong
val finalValue =
if (miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean] && value < 0)
(value + Math.pow(2, precision * 8)).toLong
else value
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
case x @ FFNumberFormat(FFTypeName("LongType", delimiter), Some(precision), scale, miscProperties) ⇒
val order =
if (miscProperties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
else ByteOrder.BIG_ENDIAN
val value =
if (precision <= 1) bufferSlice.head.toLong
else if (precision <= 2) ByteBuffer.wrap(bufferSlice).order(order).getShort.toLong
else if (precision <= 4) ByteBuffer.wrap(bufferSlice).order(order).getInt.toLong
else ByteBuffer.wrap(bufferSlice).order(order).getLong
val finalValue =
if (miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean] && value < 0)
(value + Math.pow(2, precision * 8)).toLong
else value
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
case x @ FFNumberFormat(_, Some(precision), scale, miscProperties) ⇒
val schemaRecordType =
if (miscProperties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else recordType
if (x.miscProperties.contains("packed") && x.miscProperties("packed") == true) {
val isUnsigned = miscProperties.getOrElse("unsigned", false).asInstanceOf[Boolean]
val isStripped = miscProperties.getOrElse("stripped", false).asInstanceOf[Boolean]
// note: the ebcdic re-encoding branch below is currently disabled by the "&& false" condition
val curBytes =
if (schemaRecordType == "ebcdic" && false)
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
.getBytes()
else bufferSlice
val decimal = curBytes.zipWithIndex
.map {
case (byte, idx) ⇒
if (idx == bufferSlice.length - 1 && !isStripped) {
(((byte >> 4) & 0x0f), idx)
} else {
(((byte >> 4) & 0x0f) * 10 + (byte & 0x0f), idx)
}
}
.foldLeft(BigDecimal(0)) {
case (sum, (number, idx)) ⇒
if (idx == bufferSlice.length - 1 && !isStripped) {
sum * 10 + number
} else {
sum * 100 + number
}
} / Math.pow(10, scale.getOrElse(0).toDouble)
val signedDecimal = if ((bufferSlice.last & 0x0f) == 13 && !isUnsigned) {
-decimal
} else {
decimal
}
val finalValue = Decimal(signedDecimal, precision, scale.getOrElse(0))
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
} else if (x.miscProperties.contains("zoned") && x.miscProperties("zoned") == true) {
try {
val (str, isNegative) = fetchNumberStringWithSign(bufferSlice, schemaRecordType)
val strWithDot = scale
.map(scale ⇒ s"${str.slice(0, str.length - scale)}.${str.slice(str.length - scale, str.length)}")
.getOrElse(str)
val finalValue =
if (isNegative)
new Decimal().set(BigDecimal(StringUtils.stripStart(strWithDot, "0"))).unary_-
else
new Decimal().set(BigDecimal(StringUtils.stripStart(strWithDot, "0")))
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
} catch {
case NonFatal(e) ⇒
val stringValue =
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
logger.debug(
s"Value being saved '$stringValue' for column: '$columnName' is not of decimal type: ${e.getMessage}"
)
columnValueMap.put(parentName + columnName, null)
row.setNullAt(idx)
}
} else {
val value =
try {
val str =
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType)).trim
val strWithDot =
if (x.miscProperties.getOrElse("decimal_point", "Comma").toString == "Period") str
else
scale
.map(scale ⇒ s"${str.slice(0, str.length - scale)}.${str.slice(str.length - scale, str.length)}")
.getOrElse(str)
new Decimal().set(BigDecimal(StringUtils.stripStart(strWithDot.trim, "0")))
} catch {
case NonFatal(e) ⇒
val stringValue =
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
logger.debug(
s"Value being saved '$stringValue' for column: '$columnName' is not of decimal type: ${e.getMessage}"
)
new Decimal().set(0)
}
try {
columnValueMap.put(parentName + columnName, value)
row(idx) = value
} catch {
case NonFatal(e) ⇒
val stringValue =
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType))
println(
s"Error while reading column '$columnName' at row '$rowIndex' with value '$stringValue' as a number: '${e.getMessage}'"
)
columnValueMap.put(parentName + columnName, null)
row.setNullAt(idx)
}
}
case FFNumberFormat(FFTypeName("IntegerType", _), _, _, miscProperties) ⇒
val schemaRecordType =
if (miscProperties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else recordType
try {
val finalValue =
new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType)).trim.toInt
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
} catch {
case e: NumberFormatException ⇒
columnValueMap.put(parentName + columnName, null)
row.setNullAt(idx)
}
case _ ⇒
val miscProperties: Map[String, String] = format match {
case FFStringFormat(name, precision, props) ⇒ props.getOrElse(Map())
case FFDateFormat(name, format, miscProperties) ⇒
miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
case FFDateTimeFormat(name, format, miscProperties) ⇒
miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
case _ ⇒ Map()
}
val schemaRecordType =
if (miscProperties.getOrElse("ebcdic", false).toString == "true") "ebcdic" else recordType
val finalValue = UTF8String
.fromString(new String(bufferSlice, FixedFormatHelper.recordTypeToCharset(schemaRecordType)))
row(idx) = finalValue
columnValueMap.put(parentName + columnName, finalValue)
}
}
start + columnLength
}
private def generateRowForCompoundSchema(
curRow: FFSchemaRow,
buffer: Array[Byte],
start: Int,
requiredSchema: StructType,
innerRows: Seq[FFSchemaRow],
rowIndex: Int,
columnValueMap: mutable.Map[String, Any],
parentName: String,
columnName: String,
recordType: String
): (GenericInternalRow, Int) = {
val columnLength = SchemaUtils.findColumnLengthSum(curRow)
val bufferSlice = buffer.slice(start, start + columnLength)
val curRequiredSchema =
requiredSchema.fields.filter(_.name == columnName).head.dataType.asInstanceOf[StructType]
val curRequiredColumns = curRequiredSchema.fields.map(_.name.toLowerCase()).toSet
val curLookupColumns = curRequiredSchema.fields.map(_.name.toLowerCase())
val curRequiredSchemaIdMap = innerRows
.map {
case FFSimpleSchemaRow(name, format, value) ⇒ name
case FFCompoundSchemaRow(compound, rows) ⇒ compound.name.get
case FFConditionalSchemaRow(condition, schemaRow: FFCompoundSchemaRow) ⇒ schemaRow.compound.name.get
case FFConditionalSchemaRow(condition, schemaRow: FFSimpleSchemaRow) ⇒ schemaRow.name
}
.zipWithIndex
.map { case (name, idx) ⇒ idx → curLookupColumns.indexOf(name.toLowerCase()) }
.toMap
(generateRow(
if (recordType == "packed") "ascii" else recordType,
innerRows,
curRequiredSchemaIdMap,
curRequiredColumns,
curLookupColumns,
curRequiredSchema,
bufferSlice,
rowIndex,
columnValueMap,
parentName + columnName + "."
)._1,
columnLength + start
)
}
private def generateRow(
recordType: String,
schemaRows: Seq[FFSchemaRow],
idMap: Map[Int, Int],
requiredColumns: Set[String],
lookupColumns: Array[String],
requiredSchema: StructType,
buffer: Array[Byte],
rowIndex: Int,
columnValueMap: mutable.Map[String, Any],
parentName: String
): (GenericInternalRow, Array[Byte]) = {
val row = new GenericInternalRow(lookupColumns.length)
val consumed = schemaRows.zipWithIndex.foldLeft(if (recordType == "packed") 3 else 0) {
case (start,
(curRow @ FFConditionalSchemaRow(condition, innerRow @ FFSimpleSchemaRow(columnName, format, defaultValue)),
_idx
)
) ⇒
val idx = idMap(_idx)
val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
if (conditionValue) {
generateRowForSimpleSchema(start,
format,
columnName,
parentName,
columnValueMap,
buffer,
requiredColumns,
row,
idx,
recordType,
rowIndex
)
} else {
row.setNullAt(idx)
start
}
case (start,
(curRow @ FFConditionalSchemaRow(condition,
innerRow @ FFCompoundSchemaRow(FFStructType(columnName), curRows)
),
_idx
)
) ⇒
val idx = idMap(_idx)
val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
if (conditionValue) {
val (rowValue, nextIndex) = generateRowForCompoundSchema(curRow,
buffer,
start,
requiredSchema,
curRows,
rowIndex,
columnValueMap,
parentName,
columnName,
recordType
)
row(idx) = rowValue
nextIndex
} else {
row.setNullAt(idx)
start
}
case (start, (curRow @ FFCompoundSchemaRow(FFStructType(columnName), innerRows), _idx)) ⇒
val idx = idMap(_idx)
val (rowValue, nextIndex) = generateRowForCompoundSchema(curRow,
buffer,
start,
requiredSchema,
innerRows,
rowIndex,
columnValueMap,
parentName,
columnName,
recordType
)
row(idx) = rowValue
nextIndex
case (start,
(curRow @ FFConditionalSchemaRow(
condition,
innerRow @ FFCompoundSchemaRow(FFStructArrayType(columnName, Some(arraySizeInfo)), innerRows)
),
_idx
)
) ⇒
val idx = idMap(_idx)
val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
if (conditionValue) {
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val innerRow = FFCompoundSchemaRow(FFStructType(columnName), innerRows)
val innerSchema = requiredSchema.fields
.filter(_.name == columnName)
.head
.dataType
.asInstanceOf[ArrayType]
.elementType
.asInstanceOf[StructType]
var startIndex = start
val finalValue = (0 until arrayLength).map { curInd ⇒
val (rowValue, nextIndex) = generateRowForCompoundSchema(
innerRow,
buffer,
startIndex,
StructType(List(StructField(columnName, StructType(innerSchema)))),
innerRows,
rowIndex,
columnValueMap,
parentName,
columnName,
recordType
)
startIndex = nextIndex
rowValue
}
row(idx) = ArrayData.toArrayData(finalValue)
startIndex
} else {
row.setNullAt(idx)
start
}
case (start,
(curRow @ FFCompoundSchemaRow(FFStructArrayType(columnName, Some(arraySizeInfo)), innerRows), _idx)
) ⇒
val idx = idMap(_idx)
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val innerRow = FFCompoundSchemaRow(FFStructType(columnName), innerRows)
val innerSchema = requiredSchema.fields
.filter(_.name == columnName)
.head
.dataType
.asInstanceOf[ArrayType]
.elementType
.asInstanceOf[StructType]
var startIndex = start
val finalValue = (0 until arrayLength).map { curInd ⇒
val (rowValue, nextIndex) = generateRowForCompoundSchema(
innerRow,
buffer,
startIndex,
StructType(List(StructField(columnName, StructType(innerSchema)))),
innerRows,
rowIndex,
columnValueMap,
parentName,
columnName,
recordType
)
startIndex = nextIndex
rowValue
}
row(idx) = ArrayData.toArrayData(finalValue)
startIndex
case (start, (FFSimpleSchemaRow(columnName, format, defaultValue), _idx)) ⇒
val idx = idMap(_idx)
generateRowForSimpleSchema(start,
format,
columnName,
parentName,
columnValueMap,
buffer,
requiredColumns,
row,
idx,
recordType,
rowIndex
)
}
// return the generated row together with the bytes left over after the consumed record
// XXX is slicing the most efficient way to do this?
row → buffer.slice(consumed, buffer.length)
}
override def prepareWrite(
sparkSession: SparkSession,
job: Job,
options: Map[String, String],
dataSchema: StructType
): OutputWriterFactory = {
new OutputWriterFactory {
override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
val ffSchema = Json.parse(options("schema")).as[FFSchemaRecord]
new FixedFormatOutputWriter(path, context, ffSchema, dataSchema)
}
override def getFileExtension(context: TaskAttemptContext): String =
".bin" + CodecStreams.getCompressionExtension(context)
}
}
}
class FixedFormatOutputWriter(
path: String,
context: TaskAttemptContext,
ffSchema: FFSchemaRecord,
fullDFSchema: StructType
) extends OutputWriter {
private var outputStream: Option[OutputStream] = None
private var schemaRows = ffSchema.rows
private var dfSchema = fullDFSchema
private var curParent = ""
private val columnValueMap = mutable.Map[String, Any]()
override def write(row: InternalRow): Unit = {
val os = outputStream.getOrElse {
val newStream = CodecStreams.createOutputStream(context, new Path(path))
outputStream = Some(newStream)
newStream
}
def writeRows(rows: Seq[FFSchemaRow]): Unit = {
rows.foreach {
case FFSimpleSchemaRow(columnName, format, defaultValue) ⇒
val columnNameWithParent = curParent + columnName
val (dfField, dfIdx) = dfSchema.fields.zipWithIndex
.find(_._1.name.toLowerCase() == columnName.toLowerCase())
.getOrElse(
throw new Exception(
s"Missing column '$columnName' in the dataframe with the schema: '$dfSchema'"
)
)
val length = SchemaUtils.fixedLength(format)
val data = format match {
case x @ FFStringArrayFormat(name, Some(precision), Some(arraySizeInfo)) ⇒
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val oldRows = schemaRows
val oldDFSchema = dfSchema
schemaRows = FFSimpleSchemaRow(columnName,
FFStringFormat(FFTypeName("StringType", None), Some(precision)),
FFNoDefaultVal()
) :: Nil
dfSchema =
StructType(List(StructField(columnName, dfField.dataType.asInstanceOf[ArrayType].elementType, true)))
val oldCurParent = curParent
(0 until arrayLength).foreach { curId ⇒
val childRow = new GenericInternalRow(1)
childRow(0) = row.getArray(dfIdx).getUTF8String(curId) //.toSeq[String](StringType)(curId)
write(childRow)
}
curParent = oldCurParent
schemaRows = oldRows
dfSchema = oldDFSchema
Array.empty[Byte]
case x @ FFNumberArrayFormat(name, Some(precision), scale, Some(arraySizeInfo), miscProperties) ⇒
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val oldRows = schemaRows
val oldDFSchema = dfSchema
schemaRows = FFSimpleSchemaRow(columnName,
FFNumberFormat(name, Some(precision), scale, miscProperties),
FFNoDefaultVal()
) :: Nil
dfSchema =
StructType(List(StructField(columnName, dfField.dataType.asInstanceOf[ArrayType].elementType, true)))
val oldCurParent = curParent
(0 until arrayLength).foreach { curId ⇒
val childRow = new GenericInternalRow(1)
childRow(0) = if (name.name == "IntegerType") {
if (precision <= 1)
row.getArray(dfIdx).getByte(curId)
else if (
precision <= 2 || (precision <= 1 && miscProperties
.getOrElse("unsigned", false)
.asInstanceOf[Boolean])
) row.getArray(dfIdx).getShort(curId)
else if (
precision <= 4 || (precision <= 2 && miscProperties
.getOrElse("unsigned", false)
.asInstanceOf[Boolean])
) row.getArray(dfIdx).getInt(curId)
else if (
precision <= 8 || (precision <= 4 && miscProperties
.getOrElse("unsigned", false)
.asInstanceOf[Boolean])
) row.getArray(dfIdx).getLong(curId)
} else if (name.name == "LongType") row.getArray(dfIdx).getLong(curId)
else row.getArray(dfIdx).getUTF8String(curId) //.toSeq[String](StringType)(curId)
write(childRow)
}
curParent = oldCurParent
schemaRows = oldRows
dfSchema = oldDFSchema
Array.empty[Byte]
case x @ FFNumberFormat(FFTypeName("IntegerType", _), Some(precision), None, properties) ⇒
val endian =
if (properties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
else ByteOrder.BIG_ENDIAN
var byteBuffer = ByteBuffer.allocate(length.getOrElse(4)).order(endian)
if (properties.getOrElse("unsigned", false).asInstanceOf[Boolean]) {
if (length.get <= 1) {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.put(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toByte
)
} else {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
}
} else if (length.get <= 2) {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putShort(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toShort
)
} else {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
}
} else {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putInt(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toInt
)
} else {
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
}
}
} else {
if (length.get <= 1) {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
} else if (length.get <= 2) {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
} else {
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
}
}
byteBuffer.array()
case x @ FFNumberFormat(FFTypeName("LongType", _), Some(precision), None, properties) ⇒
val endian =
if (properties.getOrElse("endian", "big").toString == "little") ByteOrder.LITTLE_ENDIAN
else ByteOrder.BIG_ENDIAN
var byteBuffer = ByteBuffer.allocate(length.getOrElse(8)).order(endian)
if (properties.getOrElse("unsigned", false).asInstanceOf[Boolean]) {
if (length.get <= 1) {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.put(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toByte
)
} else {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
}
} else if (length.get <= 2) {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putShort(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toShort
)
} else {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
}
} else if (length.get <= 4) {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putInt(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toInt
)
} else {
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
}
} else {
if (row.getLong(dfIdx) >= Math.pow(2, 8 * length.get - 1)) {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putLong(
(row.getLong(dfIdx) - Math.pow(2, 8 * length.get)).toLong
)
} else {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putLong(row.getLong(dfIdx))
}
}
} else {
if (length.get <= 1) {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
} else if (length.get <= 2) {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
} else if (length.get <= 4) {
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
} else {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putLong(row.getLong(dfIdx))
}
}
byteBuffer.array()
case x @ FFNumberFormat(FFTypeName("DecimalType", _), Some(precision), scale, properties)
if properties.getOrElse("packed", false).asInstanceOf[Boolean] ⇒
val schemaRecordType =
if (properties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else ffSchema.recordType
val columnField = dfSchema.fields
.find(_.name.toLowerCase == columnName.toLowerCase)
.getOrElse(throw new Exception(s"Column '$columnName' not found in the DataFrame's schema"))
val bigDecimal = columnField.dataType match {
case DecimalType() ⇒
val decimal = columnField.dataType.asInstanceOf[DecimalType]
columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, precision, scale.getOrElse(0)))
row.getDecimal(dfIdx, precision, scale.getOrElse(0)).toJavaBigDecimal
case IntegerType ⇒
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
java.math.BigDecimal.valueOf(row.getInt(dfIdx))
case LongType ⇒
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
java.math.BigDecimal.valueOf(row.getLong(dfIdx))
case StringType ⇒
columnValueMap.put(columnNameWithParent, row.getString(dfIdx))
new java.math.BigDecimal(row.getString(dfIdx))
case _ ⇒
columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
java.math.BigDecimal.valueOf(row.getDouble(dfIdx))
}
bigDecimalToPackedBytes(
bigDecimal,
precision,
scale.getOrElse(0),
schemaRecordType,
properties.getOrElse("unsigned", false).asInstanceOf[Boolean],
properties.getOrElse("stripped", false).asInstanceOf[Boolean]
)
case x @ FFNumberFormat(FFTypeName("DecimalType", _), Some(precision), Some(scale), properties) ⇒
val schemaRecordType =
if (properties.getOrElse("ebcdic", false).asInstanceOf[Boolean]) "ebcdic" else ffSchema.recordType
if (x.miscProperties.contains("zoned") && x.miscProperties("zoned") == true) {
val chars =
if (!row.isNullAt(dfIdx)) {
columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, precision, scale))
row.getDecimal(dfIdx, precision, scale).toString.toCharArray.filter(_ != '.')
} else Array[Char]()
if (length.isEmpty) {
charArrayToFixedBytes(chars, length, schemaRecordType, FixedBytesFormat.NumberType)
} else {
charArrayToZonedFixedBytes(chars, length.get, schemaRecordType, '0')
}
} else {
val appendChar =
if (schemaRecordType == "ebcdic") '0' else ' '
val chars =
if (!row.isNullAt(dfIdx)) {
val stringValue =
if (scale == 0) {
columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, precision, scale))
row.getDecimal(dfIdx, precision, scale).toString()
} else if (precision <= 18) {
columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
Decimal(BigDecimal(row.getDouble(dfIdx), MathContext.UNLIMITED), precision, scale).toString
} else {
columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
row.getDouble(dfIdx).toString
}
if (x.miscProperties.getOrElse("decimal_point", "Comma").toString == "Comma")
stringValue.toCharArray.filter(_ != '.')
else stringValue.toCharArray
} else Array[Char]()
charArrayToFixedBytes(chars, length, schemaRecordType, FixedBytesFormat.NumberType, appendChar)
}
case _ ⇒
val (endian, isEbcdic) = format match {
case FFNumberFormat(name, precision, scale, miscProperties) ⇒
val endian = miscProperties.getOrElse("endian", "big") match {
case "big" ⇒ ByteOrder.BIG_ENDIAN
case _ ⇒ ByteOrder.LITTLE_ENDIAN
}
(endian, miscProperties.getOrElse("ebcdic", false).asInstanceOf[Boolean])
case _ ⇒ (ByteOrder.BIG_ENDIAN, false)
}
val schemaRecordType = if (isEbcdic) "ebcdic" else ffSchema.recordType
dfField.dataType match {
case DecimalType() ⇒
val decimal = dfField.dataType.asInstanceOf[DecimalType]
columnValueMap.put(columnNameWithParent, row.getDecimal(dfIdx, decimal.precision, decimal.scale))
val chars = row.getDecimal(dfIdx, decimal.precision, decimal.scale).toString.toCharArray
charArrayToFixedBytes(chars, length, schemaRecordType)
case LongType ⇒
var byteBuffer = ByteBuffer.allocate(length.getOrElse(8)).order(endian)
if (length.get <= 1) {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
} else if (length.get <= 2) {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
} else if (length.get <= 4) {
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
} else {
columnValueMap.put(columnNameWithParent, row.getLong(dfIdx))
byteBuffer = byteBuffer.putLong(row.getLong(dfIdx))
}
byteBuffer.array()
case IntegerType ⇒
var byteBuffer = ByteBuffer.allocate(length.getOrElse(4)).order(endian)
if (length.get <= 1) {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
} else if (length.get <= 2) {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
} else {
columnValueMap.put(columnNameWithParent, row.getInt(dfIdx))
byteBuffer = byteBuffer.putInt(row.getInt(dfIdx))
}
byteBuffer.array()
case ShortType ⇒
var byteBuffer = ByteBuffer.allocate(length.getOrElse(2)).order(endian)
if (length.get <= 1) {
columnValueMap.put(columnNameWithParent, row.getByte(dfIdx))
byteBuffer = byteBuffer.put(row.getByte(dfIdx))
} else {
columnValueMap.put(columnNameWithParent, row.getShort(dfIdx))
byteBuffer = byteBuffer.putShort(row.getShort(dfIdx))
}
byteBuffer.array()
case DoubleType ⇒
columnValueMap.put(columnNameWithParent, row.getDouble(dfIdx))
val chars = row.getDouble(dfIdx).toString.toCharArray
charArrayToFixedBytes(chars, length, schemaRecordType)
// TODO we should add proper handling for all the potential data types here; if a data type is incorrect
// and we fall back to UTF8String, this sometimes causes unrecoverable JVM segfaults, because the
// UTF8String type uses unsafe Java APIs!
case _ ⇒
val miscProperties: Map[String, String] = format match {
case FFStringFormat(name, precision, props) ⇒ props.getOrElse(Map())
case FFDateFormat(name, format, miscProperties) ⇒
miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
case FFDateTimeFormat(name, format, miscProperties) ⇒
miscProperties.map(entry ⇒ (entry._1, entry._2.toString))
case _ ⇒ Map()
}
val schemaRecordType =
if (miscProperties.getOrElse("ebcdic", false).toString == "true") "ebcdic" else ffSchema.recordType
try columnValueMap.put(columnNameWithParent, row.getString(dfIdx))
catch {
case NonFatal(_) ⇒ println(s"Could not fetch the value of column '$columnNameWithParent'")
}
val chars = if (!row.isNullAt(dfIdx)) Try(row.getString(dfIdx).toCharArray).getOrElse {
val fields = dfSchema.fields.zipWithIndex
.filter(_._2 != dfIdx)
.map(field ⇒
Try(s"Field ${field._1.name}: ${row.getString(field._2)};")
.getOrElse(s"Empty Field ${field._1.name};")
)
.mkString("\n")
throw new Exception(s"Parsing of the column '$columnName' failed. Other fields: \n$fields")
}
else Array[Char]()
charArrayToFixedBytes(chars, length, schemaRecordType)
}
}
if (data.nonEmpty)
os.write(data)
case FFCompoundSchemaRow(FFUnionType(Some(unionName)), compoundRows) ⇒
dfSchema.fields.zipWithIndex
.find(_._1.name.toLowerCase() == unionName.toLowerCase())
.map {
case (_, idx) ⇒
val unionValue = row.getString(idx)
compoundRows
.find {
case FFCompoundSchemaRow(FFStructType(potentialStruct), _) ⇒
potentialStruct == unionValue
}
.map {
case FFCompoundSchemaRow(FFStructType(_), structRows) ⇒
writeRows(structRows)
}
.getOrElse(throw new Exception(s"Couldn't find a struct in the union with the value: '$unionValue'"))
}
.getOrElse(throw new Exception(s"Couldn't find a column '$unionName' referenced by a compound union type"))
case FFCompoundSchemaRow(FFStructType(columnName), rows) ⇒
val (dfField, dfIdx) = dfSchema.fields.zipWithIndex
.find(_._1.name.toLowerCase() == columnName.toLowerCase())
.getOrElse(
throw new Exception(
s"Missing column '$columnName' in the dataframe with the schema: '$dfSchema'"
)
)
val oldRows = schemaRows
val oldDFSchema = dfSchema
schemaRows = rows
dfSchema = dfField.dataType.asInstanceOf[StructType]
val oldCurParent = curParent
curParent = curParent + columnName + "."
write(row.getStruct(dfIdx, rows.length))
curParent = oldCurParent
schemaRows = oldRows
dfSchema = oldDFSchema
case FFCompoundSchemaRow(FFStructArrayType(columnName, Some(arraySizeInfo)), rows) ⇒
val (dfField, dfIdx) = dfSchema.fields.zipWithIndex
.find(_._1.name.toLowerCase() == columnName.toLowerCase())
.getOrElse(
throw new Exception(
s"Missing column '$columnName' in the dataframe with the schema: '$dfSchema'"
)
)
val arrayLength = getLengthFromArraySizeInfo(arraySizeInfo, columnValueMap)
val oldRows = schemaRows
val oldDFSchema = dfSchema
schemaRows = rows
dfSchema = dfField.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType]
val oldCurParent = curParent
curParent = curParent + columnName + "."
(0 until arrayLength).foreach { curId ⇒
write(row.getArray(dfIdx).getStruct(curId, rows.length))
}
curParent = oldCurParent
schemaRows = oldRows
dfSchema = oldDFSchema
case FFConditionalSchemaRow(condition, schemaRow) ⇒
val conditionValue = BooleanExpressionEvaluator.evaluateCondition(condition, columnValueMap)
if (conditionValue)
writeRows(schemaRow :: Nil)
}
}
writeRows(schemaRows)
}
private def bigDecimalToPackedBytes(
bigDecimal: java.math.BigDecimal,
precision: Int,
scale: Int,
recordType: String,
isUnsigned: Boolean,
isStripped: Boolean
): Array[Byte] = {
var number =
new BigInteger(bigDecimal.unscaledValue().abs().toByteArray).toString
val numberWithoutLast = if (isStripped) number else number.substring(0, number.length - 1)
val evenNumber = if (numberWithoutLast.length % 2 == 0) {
numberWithoutLast
} else {
"0" + numberWithoutLast
}
val prefixBytes =
Array.fill[Byte](
Math.floor((precision.toDouble - numberWithoutLast.length) / 2).toInt - (if (!isStripped) 1 else 0)
)(0)
val bytes = evenNumber.toCharArray
.grouped(2)
.map { group ⇒
val (high, low) = (group(0).asDigit, group(1).asDigit)
((high.toByte << 4) | low.toByte).toByte
}
.toArray
val signNibble =
if (isUnsigned) 0xf
else if (bigDecimal.compareTo(BigDecimal(0).bigDecimal) >= 0) 0xc // 0 cannot be represented as "-0", thus >=
else 0xd
val byteSuffix = (number.last.toString.toInt.toByte << 4 | signNibble).toByte
// Length: 10, number: 1, prefixBytes: 4, bytes: 0, byteSuffix: 1
// Length: 10, number: 1234567890, prefixBytes: 0, bytes: 5, byteSuffix: 1
// Length: 9, number: 1, prefixBytes: 4, bytes: 0, byteSuffix: 1
// Length: 9, number: 123456789, prefixBytes: 0, bytes: 5, byteSuffix: 1
var finalBytes = prefixBytes ++ bytes ++ (if (!isStripped) Array(byteSuffix) else Nil)
// note: `result` is computed but never used below; only the zero-padded `finalBytes` is returned
val result =
if (recordType == "ebcdic") new String(finalBytes).getBytes(FixedFormatHelper.recordTypeToCharset(recordType))
else finalBytes
val totalResultBytes = precision / 2 + precision % 2
finalBytes = Array.fill[Byte](totalResultBytes - finalBytes.length)(0) ++ finalBytes
finalBytes
// prefixBytes ++ bytes ++ Array(byteSuffix)
}
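// Worked example (hypothetical values): packing -1234.5 with precision = 7, scale = 1, unsigned = false and
// stripped = false (the classic packed / COMP-3 layout). The unscaled digits are "12345"; the last digit is
// reserved for the trailing sign byte, "1234" is packed two digits per byte, the sign nibble is 0xD (negative),
// and the result is left-padded with zero bytes up to precision / 2 + precision % 2 = 4 bytes:
//   0x00 0x12 0x34 0x5D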
/**
* Converts an input decimal number, given as a character array, into its zoned decimal representation for both
* the ebcdic and ascii charsets.
*
* More information at https://github.com/SimpleDataLabsInc/prophecy/issues/662
*
* @param chars      decimal digits to encode, optionally prefixed with '-'
* @param length     number of output bytes to produce
* @param recordType "ascii" or "ebcdic"
* @param padding    intended padding character (currently unused by the implementation)
* @return the zoned decimal bytes
*/
private def charArrayToZonedFixedBytes(
chars: Array[Char],
length: Int,
recordType: String,
padding: Char
): Array[Byte] = {
val baseHexaDecimalValue = recordType match {
case "ebcdic" ⇒ 0xf0.toByte
case "ascii" ⇒ 0x30.toByte
}
val zonedDecimalBytes: Array[Byte] = Array.fill(length)(baseHexaDecimalValue)
val inputCharLength = if (chars(0) == '-') chars.length - 1 else chars.length
val newStartIndex = Math.max(length - inputCharLength, 0)
val oldStartIndex = chars.length - inputCharLength
// copy every digit except the last one; the last digit is folded into the trailing sign byte below
for (
(newIdx, oldIdx) ← (newStartIndex until length - 1)
.zip(oldStartIndex until chars.length - 1)
)
zonedDecimalBytes(newIdx) = (baseHexaDecimalValue + 0x01 * (chars(oldIdx) - '0')).toByte
zonedDecimalBytes(length - 1) = recordType match {
case "ebcdic" ⇒
if (chars(0) == '-') (0xd0 + 0x01 * (chars(chars.length - 1) - '0')).toByte
else (0xc0 + 0x01 * (chars(chars.length - 1) - '0')).toByte
case "ascii" ⇒
if (chars(0) == '-') (0x70 + 0x01 * (chars(chars.length - 1) - '0')).toByte
else (0x30 + 0x01 * (chars(chars.length - 1) - '0')).toByte
}
zonedDecimalBytes
}
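// Worked example (hypothetical values): encoding the chars "-125" into a zoned field of length 3 with
// recordType = "ebcdic" yields 0xF1 0xF2 0xD5 (digit zone 0xF0, negative sign zone 0xD0 on the last byte);
// with recordType = "ascii" the same value yields 0x31 0x32 0x75 (negative sign zone 0x70), matching what
// fetchNumberStringWithSign expects on the read path.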
private object FixedBytesFormat extends Enumeration {
type FixedBytesFormat = Value
val StringType, NumberType = Value
}
import FixedBytesFormat._
private def charArrayToFixedBytes(
chars: Array[Char],
length: Option[Int],
recordType: String,
format: FixedBytesFormat = FixedBytesFormat.StringType,
numberTypeChar: Char = '0'
): Array[Byte] = {
val charsPadded = length match {
case Some(length) ⇒
format match {
case StringType ⇒
val charsPadded = Array.fill(length)(' ')
for (idx ← 0 until Math.min(chars.length, length))
charsPadded(idx) = chars(idx)
charsPadded
case NumberType ⇒
val charsPadded = Array.fill(length)(numberTypeChar)
val startIndex = Math.max(length - chars.length, 0)
for ((newIdx, oldIdx) ← (startIndex until length).zip(0 until Math.min(chars.length, length))) {
val character = chars(oldIdx)
if (character == '-') {
if (numberTypeChar == '0') {
charsPadded(newIdx) = numberTypeChar
charsPadded(0) = '-'
} else {
charsPadded(newIdx) = '-'
}
} else {
charsPadded(newIdx) = character
}
}
charsPadded
}
case None ⇒
val charsPadded = Array.fill(chars.length + 1)('\0')
for (idx ← chars.indices)
charsPadded(idx) = chars(idx)
charsPadded
}
String.copyValueOf(charsPadded).getBytes(FixedFormatHelper.recordTypeToCharset(recordType))
}
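// Behaviour sketch (hypothetical values): for a StringType field of length 8, "AB" becomes "AB      "
// (right-padded with spaces) before charset encoding; for a NumberType field of length 5 with
// numberTypeChar = '0', "-42" becomes "-0042" (right-aligned, with the '-' moved to the front when zero
// padding is used). When no length is given, the chars are written as-is followed by a single '\0'.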
def close(): Unit = outputStream.foreach(_.close())
def path(): String = path
}
object FixedFormatHelper {
def recordTypeToCharset(recordType: String): String = {
recordType match {
case "ebcdic" ⇒ "Cp1047"
// ASCII works here as well for most characters; however, ABI also allows encoding extended characters
// that are outside the range of standard ASCII.
// There are many extensions to ASCII; windows-1252 appears to be the one that ABI uses
// (determined by trial and error).
case "ascii" | "utf8" ⇒ "windows-1252"
case "packed" ⇒ "windows-1252"
case _ ⇒ throw new Exception(s"Unsupported record type '$recordType'")
}
}
}
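// Illustrative round trip (hypothetical snippet, not part of the library): the charset names returned by
// recordTypeToCharset can be passed directly to the java.lang.String encoding/decoding APIs, which is how the
// reader and writer paths above use them:
//
//   val bytes = "HELLO".getBytes(FixedFormatHelper.recordTypeToCharset("ebcdic")) // Cp1047-encoded bytes
//   val text  = new String(bytes, FixedFormatHelper.recordTypeToCharset("ebcdic")) // back to "HELLO"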