
/*
* Copyright 2018-2019 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.cobrix.spark.cobol.source.index
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.slf4j.LoggerFactory
import za.co.absa.cobrix.spark.cobol.reader.{Constants, Reader}
import za.co.absa.cobrix.spark.cobol.reader.index.entry.SparseIndexEntry
import za.co.absa.cobrix.spark.cobol.reader.varlen.VarLenReader
import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters
import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
import za.co.absa.cobrix.spark.cobol.source.types.FileWithOrder
import za.co.absa.cobrix.spark.cobol.utils.{HDFSUtils, SparkUtils}
/**
* Builds offset indexes for distributed processing of variable-length records.
*
* Index creation tries to optimize locality by finding the HDFS blocks that contain the records and instructing
* Spark to create the RDD partitions according to those locations.
*
* In a nutshell, ideally there will be as many partitions as there are index entries.
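*
* A minimal usage sketch (hedged: `filesList`, `cobolReader` and `sqlContext` are assumed to be supplied by the
* surrounding source relation, and `LocalityParameters` is assumed to be a simple holder of the two flags used here):
*
* {{{
*   // Hypothetical inputs prepared by the caller.
*   val localityParams = LocalityParameters(improveLocality = true, optimizeAllocation = false)
*   val indexRdd: RDD[SparseIndexEntry] =
*     IndexBuilder.buildIndex(filesList, cobolReader, sqlContext)(localityParams)
*   // Each SparseIndexEntry describes a byte range of one input file to be parsed by a single Spark task.
* }}}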
*/
private[source] object IndexBuilder {
private val logger = LoggerFactory.getLogger(this.getClass)
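/**
* Entry point for index construction. For variable-length readers a sparse index RDD is built, with or without
* the locality optimizations, depending on `localityParams`. For all other readers no index is needed, so `null`
* is returned.
*/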
def buildIndex(filesList: Array[FileWithOrder], cobolReader: Reader, sqlContext: SQLContext)(localityParams: LocalityParameters): RDD[SparseIndexEntry] = {
cobolReader match {
case reader: VarLenReader => {
if (reader.isIndexGenerationNeeded && localityParams.improveLocality) {
buildIndexForVarLenReaderWithFullLocality(filesList, reader, sqlContext)(localityParams.optimizeAllocation)
} else {
buildIndexForVarLenReader(filesList, reader, sqlContext)
}
}
case _ => null
}
}
/**
* Builds the indexes by querying HDFS for the block locations of the records and then asking Spark to assign
* executors that are local to those locations.
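*
* The flow is: (1) build an RDD of input files with preferred locations, (2) generate a sparse index per file and
* attach HDFS block locations to every entry, (3) optionally rebalance the allocation across executors, and
* (4) create the final RDD using those locations as partition placement preferences.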
*/
private def buildIndexForVarLenReaderWithFullLocality(filesList: Array[FileWithOrder], reader: VarLenReader, sqlContext: SQLContext)
(optimizeAllocation: Boolean): RDD[SparseIndexEntry] = {
val conf = sqlContext.sparkContext.hadoopConfiguration
val filesRDD = toRDDWithLocality(filesList, conf, sqlContext)
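// Hadoop's Configuration is not serializable, so it is wrapped before being captured by the executor-side closures.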
val sconf = new SerializableConfiguration(conf)
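// For every file: stream it, generate its sparse index, and pair each index entry with the HDFS block locations
// that hold its byte range (used later as preferred locations for the partitions).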
val indexes = filesRDD.mapPartitions(
partition => {
val fileSystem = FileSystem.get(sconf.value)
partition.flatMap(row => {
val filePath = row.filePath
val fileOrder = row.order
logger.info(s"Going to generate index for the file: $filePath")
val index = reader.generateIndex(new FileStreamer(filePath, fileSystem, 0, 0),
fileOrder, reader.isRdwBigEndian)
index.map(entry => {
val offset = if (entry.offsetFrom >= 0) entry.offsetFrom else 0
val length = getBlockLengthByIndexEntry(entry)
(entry, HDFSUtils.getBlocksLocations(new Path(filePath), offset, length, fileSystem))
})
}
)
})
logger.info("Going to collect located indexes into driver.")
val offsetsLocations: Seq[(SparseIndexEntry, Seq[String])] = if (optimizeAllocation) {
optimizeDistribution(indexes.collect(), sqlContext.sparkContext)
} else {
indexes.collect()
}
logger.info(s"Creating RDD for ${offsetsLocations.length} located indexes.")
if (logger.isDebugEnabled) {
logger.debug("Preferred locations per index entry")
offsetsLocations.foreach(allocation => logger.debug(allocation.toString()))
}
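// makeRDD accepts (value, preferredLocations) pairs, so Spark will try to schedule each partition
// on one of the hosts that store the corresponding HDFS blocks.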
sqlContext.sparkContext.makeRDD(offsetsLocations)
}
private def getBlockLengthByIndexEntry(entry: SparseIndexEntry): Long = {
val indexedLength = if (entry.offsetTo > 0) entry.offsetTo else Long.MaxValue
// Each entry of a sparse index can be slightly bigger than the default HDFS block size.
// The exact size depends on record size and root level boundaries between records.
// But the overwhelming majority of these additional bytes will be less than 1 MB due to
// limitations on mainframe record sizes.
// We subtract 1 MB from indexed length to get locality nodes for the significant part
// of the block.
// In other words, we don't care if the last megabyte is not node local as long as
// most of the split chunk is node local.
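// For example, a chunk indexed as 128 MB is treated as 127 MB for locality purposes,
// while a small 5 MB chunk is used as is.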
val significantLength = if (indexedLength < 10L*Constants.megabyte) {
indexedLength
} else {
indexedLength - Constants.megabyte
}
significantLength
}
/**
* Tries to balance the allocation among the available executors, so that otherwise unused ones also receive work.
*/
private def optimizeDistribution(allocation: Seq[(SparseIndexEntry, Seq[String])], sc: SparkContext): Seq[(SparseIndexEntry, Seq[String])] = {
val availableExecutors = SparkUtils.currentActiveExecutors(sc)
logger.info(s"Trying to balance ${allocation.size} partitions among all available executors ($availableExecutors)")
LocationBalancer.balance(allocation, availableExecutors)
}
/**
* Converts the list of files into an RDD with preferred locations for the partitions.
*/
private def toRDDWithLocality(filesList: Array[FileWithOrder], conf: Configuration, sqlContext: SQLContext): RDD[FileWithOrder] = {
val fileSystem = FileSystem.get(conf)
val filesWithPreferredLocations = filesList.map(file => {
(file, HDFSUtils.getBlocksLocations(new Path(file.filePath), fileSystem))
}).toSeq
filesWithPreferredLocations.foreach(a => logger.debug(a.toString()))
sqlContext.sparkContext.makeRDD(filesWithPreferredLocations)
}
/**
* Builds record indexes. Does not take locality into account. Might be removed in future releases.
*/
def buildIndexForVarLenReader(filesList: Array[FileWithOrder], reader: VarLenReader, sqlContext: SQLContext): RDD[SparseIndexEntry] = {
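// One Spark partition per input file so that indexes for different files are generated in parallel.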
val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length)
val conf = sqlContext.sparkContext.hadoopConfiguration
val sconf = new SerializableConfiguration(conf)
val indexes = filesRDD.mapPartitions(
partition => {
val fileSystem = FileSystem.get(sconf.value)
partition.flatMap(row => {
val filePath = row.filePath
val fileOrder = row.order
logger.info(s"Going to generate index for the file: $filePath")
val index = reader.generateIndex(new FileStreamer(filePath, fileSystem, 0, 0),
fileOrder, reader.isRdwBigEndian)
index
}
)
}).cache
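// count() materializes the cached index once so the number of partitions can be chosen;
// the repartitioned (and re-cached) RDD of index entries is what is returned to the caller.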
val indexCount = indexes.count()
val numPartitions = Math.min(indexCount, Constants.maxNumPartitions).toInt
logger.warn(s"Index elements count: $indexCount, number of partitions = $numPartitions")
indexes.repartition(numPartitions).cache()
}
}