za.co.absa.cobrix.spark.cobol.source.streaming.CobolStreamer.scala

/*
 * Copyright 2018-2019 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.cobrix.spark.cobol.source.streaming

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import za.co.absa.cobrix.cobol.parser.decoders.StringTrimmingPolicy
import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
import za.co.absa.cobrix.spark.cobol.reader.fixedlen.{FixedLenNestedReader, FixedLenReader}
import za.co.absa.cobrix.spark.cobol.schema.SchemaRetentionPolicy
import za.co.absa.cobrix.spark.cobol.source.parameters.CobolParametersParser._
import za.co.absa.cobrix.spark.cobol.source.parameters.CobolParametersValidator

import scala.collection.JavaConverters.asScalaBufferConverter

/**
 * Provides an integration point for adding streaming support to the Spark-Cobol library.
 *
 * This version is experimental and does not yet provide support for schemas or Structured Streaming.
 */
object CobolStreamer {
  
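  /**
   * Builds a fixed-record-length reader from the copybook whose path is read from SparkConf.
   * The reader is hard-coded to EBCDIC input decoded with the "common" code page; it collapses
   * the root record in the resulting Spark schema, trims string values on both sides and
   * drops group FILLERs.
   */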
  def getReader(implicit ssc: StreamingContext): FixedLenReader = {
    val copybooks = Seq(loadCopybookFromHDFS(ssc.sparkContext.hadoopConfiguration, ssc.sparkContext.getConf.get(PARAM_COPYBOOK_PATH)))
    new FixedLenNestedReader(copybooks,
      isEbcdic = true,
      CodePage.getCodePageByName("common"),
      schemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot,
      stringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
      dropGroupFillers = true,
      nonTerminals = Seq()
    )
  }
  
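  /**
   * Implicitly enriches [[StreamingContext]] with a `cobolStream()` method. The Cobol
   * parameters in SparkConf are validated when the wrapper is constructed.
   */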
  implicit class Deserializer(@transient val ssc: StreamingContext) extends Serializable {

    CobolParametersValidator.validateOrThrow(ssc.sparkContext.getConf, ssc.sparkContext.hadoopConfiguration)
    val reader = CobolStreamer.getReader(ssc)
    
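    /**
     * Reads fixed-size binary records from the source path configured in SparkConf and
     * decodes each record into one or more [[Row]]s carrying the copybook-derived Spark schema.
     */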
    def cobolStream(): DStream[Row] = {
      ssc
        .binaryRecordsStream(ssc.sparkContext.getConf.get(PARAM_SOURCE_PATH), reader.getCobolSchema.getRecordSize)
        .flatMap(record => {          
          val it = reader.getRowIterator(record)
          for (parsedRecord <- it) yield {                        
            new GenericRowWithSchema(parsedRecord.toSeq.toArray, reader.getSparkSchema)
          }
        })
    }
  }

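  /** Loads the full copybook text from a Hadoop-compatible file system, closing the stream afterwards. */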
  private def loadCopybookFromHDFS(hadoopConfiguration: Configuration, copyBookHDFSPath: String): String = {
    val hdfs = FileSystem.get(hadoopConfiguration)
    val stream = hdfs.open(new Path(copyBookHDFSPath))
    try IOUtils.readLines(stream).asScala.mkString("\n") finally stream.close()
  }

}
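
A minimal usage sketch (hypothetical application code, not part of the file above, e.g. runnable in spark-shell with the spark-cobol jar on the classpath). It assumes that PARAM_COPYBOOK_PATH and PARAM_SOURCE_PATH resolve to the SparkConf keys "copybook" and "path", and the HDFS locations below are placeholders:

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import za.co.absa.cobrix.spark.cobol.source.streaming.CobolStreamer._

val spark = SparkSession.builder()
  .appName("CobolStreamingExample")
  .master("local[2]")
  .config("copybook", "hdfs:///path/to/copybook.cpy") // assumed key read via PARAM_COPYBOOK_PATH
  .config("path", "hdfs:///path/to/records")          // assumed key read via PARAM_SOURCE_PATH
  .getOrCreate()

val ssc = new StreamingContext(spark.sparkContext, Seconds(10))

// The implicit Deserializer above adds cobolStream() to the StreamingContext
val rows = ssc.cobolStream()
rows.print(10)

ssc.start()
ssc.awaitTermination()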