/*
 * Copyright 2018 AstroLab Software
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.astrolabsoftware.sparkfits
// Scala dependencies
import scala.util.Try
import scala.collection.mutable
// Hadoop dependencies
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.FileSplit
// Spark dependencies
import org.apache.spark.sql.Row
// Logger
import org.apache.log4j.LogManager
// Internal dependencies
import com.astrolabsoftware.sparkfits.FitsLib.Fits
import com.astrolabsoftware.sparkfits.FitsLib.FITSBLOCK_SIZE_BYTES
/**
  * Class to handle the relationship between executors & HDFS when reading a
  * FITS file:
  * File -> InputSplit -> RecordReader (this class) -> Mapper (executors)
  * It extends the abstract class RecordReader from Hadoop.
  * The idea is to describe how the FITS file is split into blocks and
  * records in HDFS. First the file is split into blocks in HDFS (physical
  * blocks), whose size is given by the Hadoop configuration (typically
  * 128 MB). Then, inside a block, the data is sent to the executors
  * record-by-record (logical split), each record being smaller than 128 MB.
  * The purpose of this class is to describe the second step, that is the
  * split of blocks into records.
  *
  * The data is first read as chunks of binary data, then converted to the
  * correct type element by element, and finally grouped into rows.
  *
  * An illustrative usage sketch is given at the end of this file.
  */
class FitsRecordReader extends RecordReader[LongWritable, Seq[Row]] {
  // Initialise mutable variables to be used by the executors
  // Handle the HDFS block boundaries
  private var splitStart: Long = 0L
  private var splitEnd: Long = 0L

  // Cursor position when reading the file
  private var currentPosition: Long = 0L

  // Size of the records to read from the file
  private var recordLength: Int = 0

  // Object to manipulate the fits file
  private var fits: Fits = null
  private var header: Array[String] = null
  private var nrowsLong: Long = 0L
  private var rowSizeInt: Int = 0
  private var rowSizeLong: Long = 0L
  private var nrowsPerImage: Long = 0L
  private var startstop: FitsLib.FitsBlockBoundaries = FitsLib.FitsBlockBoundaries()
  private var notValid: Boolean = false

  // The (key, value) used to create the RDD
  private var recordKey: LongWritable = null
  private var recordValue: Seq[Row] = null

  // Intermediate variable to store binary data
  private var recordValueBytes: Array[Byte] = null
  /**
    * Close the file after reading it.
    */
  override def close() {
    if (fits.data != null) {
      fits.data.close()
    }
  }

  /**
    * Get the current Key.
    * @return (LongWritable) key.
    */
  override def getCurrentKey: LongWritable = {
    recordKey
  }

  /**
    * Get the current Value.
    * @return (Seq[Row]) Value is a list of heterogeneous lists. It will
    *   be converted to List[Row] later.
    */
  override def getCurrentValue: Seq[Row] = {
    recordValue
  }
  /**
    * Fancy way of getting a progress bar. Useful to know whether you have
    * time for a coffee and a cigarette before the next run.
    *
    * @return (Float) progression inside a block.
    */
  override def getProgress: Float = {
    splitStart match {
      case x if x == splitEnd => 0.0f
      case _ => Math.min(
        // Use floating-point division: dividing the two Longs directly would
        // truncate the ratio to 0 until the end of the split is reached.
        (currentPosition - splitStart).toFloat / (splitEnd - splitStart), 1.0f
      )
    }
  }
  /**
    * Here an executor will come and ask for a block of data
    * by calling initialize(). Hadoop will split the data into records and
    * those records will be sent. One then needs to know: the data file,
    * the starting index of a split (byte index), the size of one record of
    * data (bytes), and the ending index of a split (byte index).
    *
    * Typically, a record must not be bigger than 1 MB for the process to be
    * efficient. Otherwise you will have a lot of garbage collector calls!
    *
    * @param inputSplit : (InputSplit)
    *   Represents the data to be processed by an individual Mapper.
    * @param context : (TaskAttemptContext)
    *   Currently active context to access contextual information about
    *   running tasks.
    */
  override def initialize(inputSplit: InputSplit, context: TaskAttemptContext) {
    val log = LogManager.getRootLogger

    // Hadoop description of the input file (Path, split, start/stop indices).
    val fileSplit = inputSplit.asInstanceOf[FileSplit]

    // The actual file we will be reading from
    val file = fileSplit.getPath

    // Uncomment this to get the ID identifying the InputSplit in the form
    // hdfs://server.domain:8020/path/to/my/file:start+length
    // println(fileSplit.toString)

    // Hadoop Job configuration
    val conf = context.getConfiguration

    // Initialise our block (header + data)
    fits = new Fits(file, conf, conf.get("hdu").toInt)

    // Just get info on the primary block
    val primaryfits = new Fits(file, conf, 0)

    // Define the byte indices of our block:
    // hdu_start=header_start, dataStart, dataStop, hdu_stop
    startstop = fits.blockBoundaries

    // Get the header
    header = fits.blockHeader
    val keyValues = FitsLib.parseHeader(header)
    if (keyValues("NAXIS").toInt == 0 && conf.get("mode") == "PERMISSIVE") {
      log.warn(s"Empty HDU for ${file}")
      notValid = true
    } else {
      if (keyValues("NAXIS").toInt == 0 && conf.get("mode") == "FAILFAST") {
        log.warn(s"Empty HDU for ${file}")
        log.warn(s"Use option('mode', 'PERMISSIVE') if you want to discard all empty HDUs.")
      }
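      // Note (added for clarity): the "hdu" and "mode" values read from `conf`
      // above are, in the spark-fits data source, passed down from the Spark
      // read options (e.g. .option("hdu", "1"), .option("mode", "FAILFAST"))
      // into the Hadoop configuration seen by this reader.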
      // Total number of rows per image
      nrowsPerImage = keyValues("NAXIS2").toInt

      // Get the number of rows and the size (B) of one row.
      // This is dependent on the HDU type.
      nrowsLong = fits.hdu.getNRows(keyValues)
      rowSizeInt = fits.hdu.getSizeRowBytes(keyValues)
      rowSizeLong = rowSizeInt.toLong

      // What Hadoop gave us
      val start_theo = fileSplit.getStart
      val stop_theo = fileSplit.getStart + fileSplit.getLength

      // Reject this mapper if the HDFS block lies entirely outside the targeted HDU
      notValid = if ((start_theo < startstop.dataStart) && (stop_theo < startstop.dataStart)) {
        true
      } else if ((start_theo >= startstop.dataStop) && (stop_theo >= startstop.dataStop)) {
        true
      } else {
        false
      }
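      // Illustrative example (hypothetical numbers): if the targeted HDU data
      // lies in [dataStart, dataStop) = [2880, 1002880) and Hadoop hands us
      // the split [0, 2000), both bounds fall before dataStart and the mapper
      // is rejected; a split [900000, 1100000) overlaps the HDU and is kept,
      // with its end clipped to dataStop below.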
      val splitStart_tmp = if (start_theo <= startstop.dataStart && !notValid) {
        // Valid block: starting index.
        // We are just before the targeted HDU, therefore
        // we jump to the beginning of the data block.
        startstop.dataStart
      } else {
        start_theo
      }

      splitEnd = if (stop_theo <= startstop.dataStop && !notValid) {
        // Valid block: ending index (start/end inside).
        // We are inside the targeted HDU.
        stop_theo
      } else if (stop_theo > startstop.dataStop && !notValid) {
        // Valid block: ending index (start inside, end outside).
        // The block starts in the targeted HDU, but ends outside.
        // We just move back the final cursor.
        startstop.dataStop
      } else {
        // Not valid anyway
        stop_theo
      }
      // A priori, there is no reason for a random split of the FITS file to
      // start at the beginning of a row. Therefore we do the following:
      //  - the block starts;
      //  - its data is processed record-by-record (see below for the
      //    processing of the records);
      //  - at the end of the block, the stop index might be in the middle of
      //    a row. We do not read this row in the first block, and we stop here;
      //  - the second block starts at start_1 = end_0;
      //  - we decrement the starting index to include the previous row, not
      //    read in the first block;
      //  - its data is processed record-by-record;
      //  - etc.
      // Summary: add the last row if we start the block in the middle of a row.
      // We assume that fileSplit.getStart starts at the
      // beginning of the data block for the first valid block.
      // We shift the start to where the data block starts.
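      // Worked example (hypothetical numbers, added for illustration): with
      // rowSizeLong = 72 B, dataStart = 2880 (so shift = -2880), and a Hadoop
      // split starting at splitStart_tmp = 134217728 (128 MB), we get
      // (134217728 - 2880) % 72 = 8: the split begins 8 bytes into a row, so
      // tmp_byte is decremented to -8 and splitStart becomes 134217720, i.e.
      // the beginning of the row straddling the block boundary.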
      var shift = -startstop.dataStart
      splitStart = if ((splitStart_tmp) % rowSizeLong != 0 &&
          splitStart_tmp != startstop.dataStart && splitStart_tmp != 0) {
        // Decrement the starting index to fully catch the row we are sitting on.
        // Only do it if necessary, otherwise you will get duplicates (#93).
        var tmp_byte = 0
        if ((splitStart_tmp + tmp_byte + shift) % rowSizeLong != 0) {
          do {
            tmp_byte = tmp_byte - 1
          } while ((splitStart_tmp + tmp_byte + shift) % rowSizeLong != 0)
        }
        // Return the offset starting index
        splitStart_tmp + tmp_byte
      } else splitStart_tmp
      // Get the record length in bytes (an integer!). First check whether the
      // user specified a size for the recordLength. If not, set it to
      // max(1 KB, rowSize). If the HDU is an image, the recordLength is the
      // row size (NAXIS1 * nbytes).
      val recordLengthFromUser = Try{conf.get("recordlength").toInt}
        .getOrElse{
          if (fits.hduType == "IMAGE") {
            rowSizeInt
          } else {
            // Set it to max(1 KB, rowSize)
            math.max((1 * 1024 / rowSizeInt) * rowSizeInt, rowSizeInt)
          }
        }

      // For a table, seek a round number of rows for the record.
      recordLength = (recordLengthFromUser / rowSizeInt) * rowSizeInt

      // Make sure that the recordLength is not bigger than the block size!
      // This is a guard for small files.
      recordLength = if ((recordLength / rowSizeInt) < nrowsLong.toInt) {
        // OK, less than the total number of rows
        recordLength
      } else {
        // Small files: one record is the entire file.
        nrowsLong.toInt * rowSizeLong.toInt
      }
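      // Worked example (hypothetical numbers, added for illustration): for a
      // binary table with rowSizeInt = 72 B and no user-defined "recordlength",
      // the default is (1024 / 72) * 72 = 1008 B, i.e. 14 whole rows per
      // record. If the table only holds nrowsLong = 10 rows, the guard above
      // shrinks the record to 10 * 72 = 720 B, i.e. the whole data block.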
      // Move to the starting binary index
      fits.data.seek(splitStart)

      // Set our starting block position
      currentPosition = splitStart
    }
  }
  /**
    * Here you describe how the records are made, and the split data sent.
    *
    * @return (Boolean) true if the Mapper did not reach the end of the split.
    *   false otherwise.
    *
    */
  override def nextKeyValue(): Boolean = {
    // Close the file if the mapper is outside the HDU
    if (notValid) {
      fits.data.close()
      return false
    }

    // Close the file if we went outside the block!
    // This means we sent all our records.
    if (fits.data.getPos >= startstop.dataStop) {
      fits.data.close()
      return false
    }

    // Initialise the key of the HDFS block
    if (recordKey == null) {
      recordKey = new LongWritable()
    }

    // The key is a linear index of the record, given by the position
    // where the record starts divided by the record length.
    recordKey.set(currentPosition / recordLength)
    // The last record might not be of the same size as the others.
    // So if recordLength goes above the end of the data block, cut it.

    // If (getPos + recordLength) goes above the end of the data block (dataStop)
    recordLength = if ((startstop.dataStop - fits.data.getPos) < recordLength.toLong) {
      (startstop.dataStop - fits.data.getPos).toInt
    } else {
      recordLength
    }

    // If (currentPosition + recordLength) goes above splitEnd
    recordLength = if ((splitEnd - currentPosition) < recordLength.toLong) {
      (splitEnd - currentPosition).toInt
    } else {
      recordLength
    }

    // The last record may not end at the end of a row,
    // i.e. the record length is not a multiple of the row size.
    // This can only happen if one of the two ifs above has been triggered
    // (by default recordLength is a multiple of the row size).
    recordLength = if (recordLength % rowSizeLong != 0) {
      // Decrement recordLength until we reach the end of the previous full row.
      do {
        recordLength = recordLength - 1
      } while (recordLength % rowSizeLong != 0)
      // Return
      recordLength
    } else recordLength
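    // Worked example (hypothetical numbers, added for illustration): with
    // rowSizeLong = 72 B and only 500 B left before splitEnd, a nominal
    // 1008 B record is first cut to 500 B and then decremented to 432 B
    // (6 whole rows); the trailing 68 B of the straddling row are skipped
    // here and re-read by the split that starts on that row (see initialize).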
    // If recordLength is below the size of a row,
    // skip and leave this row for the next block.
    if (recordLength < rowSizeLong) {
      fits.data.close()
      return false
    }

    // The array to place the binary data into
    recordValueBytes = new Array[Byte](recordLength)

    // Read a record if the currentPosition is less than the split end
    if (currentPosition < splitEnd) {
      // Read the full record (bytes 0 to recordLength - 1)
      fits.data.readFully(recordValueBytes, 0, recordLength)
      val imgPosition = (((currentPosition + recordLength) / rowSizeLong - 1) / nrowsPerImage).toLong

      // Convert each row
      // 1 task: 32 MB @ 2s
      val tmp = Seq.newBuilder[Row]
      for (i <- 0 to recordLength / rowSizeLong.toInt - 1) {
        val myrow = fits.getRow(
          recordValueBytes.slice(
            rowSizeInt * i, rowSizeInt * (i + 1)
          )
        )
        val data = if (fits.hduType == "IMAGE") {
          myrow :+ imgPosition
        } else myrow
        tmp += Row.fromSeq(data)
      }
      recordValue = tmp.result

      // Update our current position
      currentPosition = currentPosition + recordLength

      // We did not reach the end of the split, and we need to send more records
      return true
    }

    // We reached the end of the split.
    // We will now go to another split (if more are available).
    fits.data.close()
    false
  }
}
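
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original source): a minimal, hedged example of
// how this RecordReader is typically exercised from the Spark side, i.e. the
// File -> InputSplit -> RecordReader -> Mapper pipeline described in the class
// documentation above. The short format name "fits" and the "hdu" and
// "recordlength" options follow the spark-fits documentation; the object name
// and the file path are illustrative assumptions only.
// ---------------------------------------------------------------------------
object FitsRecordReaderUsageSketch {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("spark-fits usage sketch")
      // .master("local[*]")  // uncomment for a local test run
      .getOrCreate()

    // Each HDFS block of the file is handled by one FitsRecordReader, which
    // emits (LongWritable, Seq[Row]) records that end up as DataFrame rows.
    val df = spark.read
      .format("fits")                         // data source registered by spark-fits
      .option("hdu", "1")                     // index of the HDU to read
      .option("recordlength", "1024")         // optional record size hint, in bytes
      .load("hdfs:///path/to/catalog.fits")   // hypothetical path

    df.printSchema()
    df.show(5)

    spark.stop()
  }
}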