za.co.absa.cobrix.cobol.reader.VarLenReader.scala
Source from the cobol-parser_2.12 artifact (COBOL Reading and Import Extensions for Apache Spark).
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.cobrix.cobol.reader

import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

import scala.collection.mutable.ArrayBuffer
/** The abstract class for COBOL data readers from various sequential sources (e.g. variable-size EBCDIC records) */
abstract class VarLenReader extends Reader with Serializable {

  /** Returns true if index generation is requested */
  def isIndexGenerationNeeded: Boolean

  /** Returns true if the RDW headers of variable-length files are big endian */
  def isRdwBigEndian: Boolean
  /**
    * Returns a file iterator between particular offsets. This is for faster traversal of big binary files.
    *
    * @param dataStream          A stream positioned at the beginning of the intended file portion to read
    * @param headerStream        A stream pointing to the beginning of the file, even if dataStream is pointing
    *                            to a record in the middle
    * @param startingFileOffset  An offset of the file where parsing should be started
    * @param fileNumber          A file number that uniquely identifies a particular file of the data set
    * @param startingRecordIndex A starting record index of the data
    * @return An iterator of records, each represented as a sequence of field values
    */
  def getRecordIterator(dataStream: SimpleStream,
                        headerStream: SimpleStream,
                        startingFileOffset: Long,
                        fileNumber: Int,
                        startingRecordIndex: Long): Iterator[Seq[Any]]
  /**
    * Traverses the data sequentially as fast as possible to generate a record index.
    * This index will be used to distribute the workload of the conversion.
    *
    * @param dataStream     A stream of input binary data
    * @param headerStream   A stream pointing to the beginning of the file, even if dataStream is pointing
    *                       to a record in the middle
    * @param fileNumber     A file number that uniquely identifies a particular file of the data set
    * @param isRdwBigEndian Indicates whether the RDW headers of the file are big endian
    * @return An index of the file
    */
  def generateIndex(dataStream: SimpleStream,
                    headerStream: SimpleStream,
                    fileNumber: Int,
                    isRdwBigEndian: Boolean): ArrayBuffer[SparseIndexEntry]
}
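
Below is a minimal usage sketch, not part of the original file, showing how a caller might drive a VarLenReader: generate the sparse index once (when requested), then open a record iterator per index entry. The SparseIndexEntry field names (offsetFrom, recordIndex) and the stream factory functions passed in as parameters are assumptions made for this illustration only.

// Usage sketch under the assumptions stated above.
import za.co.absa.cobrix.cobol.reader.VarLenReader
import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

object VarLenReaderUsageSketch {
  def readAllRecords(reader: VarLenReader,
                     openDataStreamAt: Long => SimpleStream, // hypothetical: opens the data file at a byte offset
                     openHeaderStream: () => SimpleStream,   // hypothetical: opens a stream at the start of the file
                     fileNumber: Int): Iterator[Seq[Any]] = {
    if (reader.isIndexGenerationNeeded) {
      // Build the sparse index with a fast sequential pass, then read each indexed chunk separately.
      val index = reader.generateIndex(openDataStreamAt(0L), openHeaderStream(), fileNumber, reader.isRdwBigEndian)
      index.iterator.flatMap { entry =>
        reader.getRecordIterator(openDataStreamAt(entry.offsetFrom),
                                 openHeaderStream(),
                                 entry.offsetFrom,
                                 fileNumber,
                                 entry.recordIndex)
      }
    } else {
      // No index requested: read the whole file from the beginning as a single stream.
      reader.getRecordIterator(openDataStreamAt(0L), openHeaderStream(), 0L, fileNumber, 0L)
    }
  }
}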