All Downloads are FREE. Search and download functionalities are using the official Maven repository.

za.co.absa.cobrix.spark.cobol.reader.VarLenReader.scala Maven / Gradle / Ivy

There is a newer version: 2.7.10
Show newest version
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.cobrix.spark.cobol.reader

import org.apache.spark.sql.Row
import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

import scala.collection.mutable.ArrayBuffer


/**
  * The base trait for Cobol data readers from various sequential sources (e.g. variable size EBCDIC records).
  *
  * It extends `Reader` and is `Serializable`. All members declared here are abstract; concrete readers
  * provide the row iteration and index generation logic.
  */
trait VarLenReader extends Reader with Serializable {

  /** Returns true if index generation is requested. */
  def isIndexGenerationNeeded: Boolean

  /** Returns true if the RDW header of variable length files is big endian. */
  def isRdwBigEndian: Boolean

  /**
    * Returns a file iterator between particular offsets. This is for faster traversal of big binary files.
    *
    * @param binaryData          A stream positioned at the beginning of the intended file portion to read
    * @param headerStream        A stream pointing to the beginning of the file, even if `binaryData` is pointing
    *                            to a record in the middle.
    * @param startingFileOffset  An offset of the file where parsing should be started
    * @param fileNumber          A file number uniquely identifying a particular file of the data set
    * @param startingRecordIndex A starting record index of the data
    * @return An iterator of Spark Row objects
    * @throws Exception          Declared by the `@throws` annotation; the concrete failure conditions depend
    *                            on the implementation.
    *
    */
  @throws(classOf[Exception]) def getRowIterator(binaryData: SimpleStream,
                                                 headerStream: SimpleStream,
                                                 startingFileOffset: Long,
                                                 fileNumber: Int,
                                                 startingRecordIndex: Long): Iterator[Row]

  /**
    * Traverses the data sequentially as fast as possible to generate record index.
    * This index will be used to distribute workload of the conversion.
    *
    * @param dataStream     A stream of input binary data
    * @param headerStream   A stream pointing to the beginning of the file, even if `dataStream` is pointing
    *                       to a record in the middle.
    * @param fileNumber     A file number uniquely identifying a particular file of the data set
    * @param isRdwBigEndian Presumably whether RDW headers of the data are big endian (inferred from the
    *                       name and from the trait member of the same name) - TODO confirm.
    *                       NOTE(review): inside implementations this parameter shadows the trait member
    *                       `isRdwBigEndian`; confirm callers pass a value consistent with it.
    * @return An index of the file, as a buffer of sparse index entries
    * @throws Exception     Declared by the `@throws` annotation; the concrete failure conditions depend
    *                       on the implementation.
    *
    */
  @throws(classOf[Exception]) def generateIndex(dataStream: SimpleStream,
                                                headerStream: SimpleStream,
                                                fileNumber: Int,
                                                isRdwBigEndian: Boolean): ArrayBuffer[SparseIndexEntry]
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy