za.co.absa.cobrix.cobol.reader.parameters.VariableLengthParameters.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cobol-parser_2.12 Show documentation
COBOL Reading and Import Extensions for Apache Spark
The newest version!
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.cobrix.cobol.reader.parameters

/**
  * This class holds the parameters currently used for parsing variable-length records.
  *
  * @param isRecordSequence       Does input files have 4 byte record length headers
  * @param bdw                    Block descriptor word (if specified), for FB and VB record formats
  * @param isRdwBigEndian         Is RDW big endian? It may depend on flavor of mainframe and/or mainframe to PC transfer method
  * @param isRdwPartRecLength     Does RDW count itself as part of record length itself
  * @param rdwAdjustment          Controls a mismatch between RDW and record length
  * @param recordHeaderParser     An optional custom record header parser for non-standard RDWs
  * @param recordExtractor        An optional custom raw record parser class non-standard record types
  * @param rhpAdditionalInfo      An optional additional option string passed to a custom record header parser
  * @param reAdditionalInfo       An optional additional option string passed to a custom record extractor
  * @param recordLengthField      A field that stores record length
  * @param recordLengthMap        A mapping between field value and record size.
  * @param fileStartOffset        A number of bytes to skip at the beginning of each file
  * @param fileEndOffset          A number of bytes to skip at the end of each file
  * @param generateRecordId       Generate a sequential record number for each record to be able to retain the order of the original data
  * @param isUsingIndex           Is indexing input file before processing is requested
  * @param inputSplitSizeMB       A partition size to target. In certain circumstances this size may not be exactly that, but the library will do the best effort to target that size
  * @param inputSplitRecords      The number of records to include in each partition. Notice mainframe records may have variable size, inputSplitMB is the recommended option
  * @param improveLocality        Tries to improve locality by extracting preferred locations for variable-length records
  * @param optimizeAllocation     Optimizes cluster usage in case of optimization for locality in the presence of new nodes (nodes that do not contain any blocks of the files being processed)
  * @param inputFileNameColumn    A column name to add to the dataframe. The column will contain input file name for each record similar to 'input_file_name()' function
  */
case class VariableLengthParameters(
                                     isRecordSequence:      Boolean, // [deprecated by recordFormat]
                                     bdw:                   Option[Bdw],
                                     isRdwBigEndian:        Boolean,
                                     isRdwPartRecLength:    Boolean,
                                     rdwAdjustment:         Int,
                                     recordHeaderParser:    Option[String],
                                     recordExtractor:       Option[String],
                                     rhpAdditionalInfo:     Option[String],
                                     reAdditionalInfo:      String,
                                     recordLengthField:     String,
                                     recordLengthMap:       Map[String, Int],
                                     fileStartOffset:       Int,
                                     fileEndOffset:         Int,
                                     generateRecordId:      Boolean,
                                     isUsingIndex:          Boolean,
                                     inputSplitRecords:     Option[Int],
                                     inputSplitSizeMB:      Option[Int],
                                     improveLocality:       Boolean,
                                     optimizeAllocation:    Boolean,
                                     inputFileNameColumn:   String,
                                     occursMappings:        Map[String, Map[String, Int]]
                                   )