/*
* Copyright 2018 AstroLab Software
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.astrolabsoftware.sparkfits
import scala.util.Try
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.fs.RemoteIterator
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.LocatedFileStatus
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.TableScan
import org.apache.spark.sql.sources.BaseRelation
import com.astrolabsoftware.sparkfits.FitsLib.Fits
import com.astrolabsoftware.sparkfits.FitsSchema.getSchema
import com.astrolabsoftware.sparkfits.FitsFileInputFormat._
/**
* Data Source API implementation for FITS.
* Note that for the moment, we provide support only for FITS tables.
* We will add support for FITS images later on.
*
* The interpreter session below shows how to use basic functionalities:
*
* {{{
* scala> val fn = "src/test/resources/test_file.fits"
* scala> val df = spark.read
* .format("com.astrolabsoftware.sparkfits")
* .option("hdu", 1)
* .option("verbose", true)
* .load(fn)
* +------ HEADER (HDU=1) ------+
* XTENSION= BINTABLE / binary table extension
* BITPIX = 8 / array data type
* NAXIS = 2 / number of array dimensions
* NAXIS1 = 34 / length of dimension 1
* NAXIS2 = 20000 / length of dimension 2
* PCOUNT = 0 / number of group parameters
* GCOUNT = 1 / number of groups
* TFIELDS = 5 / number of table fields
* TTYPE1 = target
* TFORM1 = 10A
* TTYPE2 = RA
* TFORM2 = E
* TTYPE3 = Dec
* TFORM3 = D
* TTYPE4 = Index
* TFORM4 = K
* TTYPE5 = RunId
* TFORM5 = J
* END
* +----------------------------+
* df: org.apache.spark.sql.DataFrame = [target: string, RA: float ... 3 more fields]
*
* scala> df.printSchema
* root
* |-- target: string (nullable = true)
* |-- RA: float (nullable = true)
* |-- Dec: double (nullable = true)
* |-- Index: long (nullable = true)
* |-- RunId: integer (nullable = true)
*
* scala> df.show(5)
* +----------+---------+--------------------+-----+-----+
* | target| RA| Dec|Index|RunId|
* +----------+---------+--------------------+-----+-----+
* |NGC0000000| 3.448297| -0.3387486324784641| 0| 1|
* |NGC0000001| 4.493667| -1.4414990980543227| 1| 1|
* |NGC0000002| 3.787274| 1.3298379564211742| 2| 1|
* |NGC0000003| 3.423602|-0.29457151504987844| 3| 1|
* |NGC0000004|2.6619017| 1.3957536426732444| 4| 1|
* +----------+---------+--------------------+-----+-----+
* only showing top 5 rows
*
* }}}
*/
class FitsRelation(parameters: Map[String, String], userSchema: Option[StructType])(@transient val sqlContext: SQLContext)
extends BaseRelation with TableScan {
// Level of verbosity
var verbosity : Boolean = false
// Initialise Hadoop configuration
val conf = new Configuration(sqlContext.sparkContext.hadoopConfiguration)
// This will contain all options used to load the data
private[sparkfits] val extraOptions = new scala.collection.mutable.HashMap[String, String]
// Pre-load basic parameters for quick checks
val filePath = parameters.get("path") match {
case Some(x) => x
case None => sys.error("'path' must be specified.")
}
val indexHDU = parameters.get("hdu") match {
case Some(x) => x
case None => throw new NoSuchElementException("""
You need to specify the HDU to be read!
spark.readfits.option("hdu", <Int>)
""")
}
/**
* Search for input FITS files. The input path can be either a single
* FITS file, a folder containing several FITS files with the
* same HDU structure, or a globbing pattern, e.g. "toto/\*.fits".
* Raises a NullPointerException if no files are found.
*
* @param fn : (String)
* Input path.
* @return (List[String]) List with all files found.
*
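* Example (illustrative, hypothetical paths):
* {{{
* searchFitsFile("file:///data/catalog.fits")
* searchFitsFile("file:///data/fits_dir")
* searchFitsFile("file:///data/fits_dir/\*.fits")
* searchFitsFile("file:///data/a.fits,file:///data/b.fits")
* }}}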
*/
def searchFitsFile(fn: String): List[String] = {
// Make it Hadoop readable
val path = new Path(fn)
val fs = path.getFileSystem(conf)
// Check whether we are globbing
val isGlob : Boolean = Try{fs.globStatus(path).size > 1}.getOrElse(false)
val isCommaSep : Boolean = Try{fn.split(",").size > 1}.getOrElse(false)
// Check whether we want to load a single FITS file or several
val isDir : Boolean = fs.isDirectory(path)
val isFile : Boolean = fs.isFile(path)
// List all the files
val listOfFitsFiles : List[String] = if (isGlob) {
val arr = fs.globStatus(path)
arr.map(x => x.getPath.toString).toList
} else if (isDir) {
val it = fs.listFiles(path, true)
getListOfFiles(it).filter{file => file.endsWith(".fits")}
} else if (isCommaSep) {
fn.split(",").toList
} else if (isFile){
List(fn)
} else {
List[String]()
}
// Check that we have at least one file
listOfFitsFiles.size match {
case x if x > 0 => if (verbosity) {
println("FitsRelation.searchFitsFile> Found " + listOfFitsFiles.size.toString + " file(s):")
listOfFitsFiles.foreach(println)
}
case x if x <= 0 => throw new NullPointerException(s"""
0 files detected! Is $fn a directory containing
FITS files or a FITS file?
""")
}
listOfFitsFiles
}
/**
* Load recursively all FITS file inside a directory.
*
* @param it : (RemoteIterator[LocatedFileStatus])
* Iterator from a Hadoop Path containing information about files.
* @param extensions : (List[String])
* List of accepted extensions. Currently only .fits is available.
* Default is List(".fits"). Note that the filtering on extensions
* is currently performed by the caller (searchFitsFile).
* @return List of files as a list of String.
*
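* Example (illustrative):
* {{{
* val dir = new Path("file:///data/fits_dir")
* val it = dir.getFileSystem(conf).listFiles(dir, true)
* val files = getListOfFiles(it)
* }}}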
*/
def getListOfFiles(it: RemoteIterator[LocatedFileStatus],
extensions: List[String] = List(".fits")): List[String] = {
if (!it.hasNext) {
Nil
} else {
it.next.getPath.toString :: getListOfFiles(it, extensions)
}
}
/**
* Check that the schemas of the different FITS HDUs to be added are
* the same. Throw an AssertionError otherwise.
* The check is performed only for BINTABLE.
*
* NOTE: This operation is very long for many files! Do not use it for
* hundreds of files!
*
* @param listOfFitsFiles : (List[String])
* List of files as a list of String.
* @return (Boolean) true if the HDU type is implemented (and the
* schemas of all files match), false otherwise.
*
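* Example (illustrative; assumes "hdu" is already set in conf):
* {{{
* conf.set("hdu", "1")
* val implemented = checkSchemaAndReturnType(
* List("file:///data/a.fits", "file:///data/b.fits"))
* }}}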
*/
def checkSchemaAndReturnType(listOfFitsFiles : List[String]): Boolean = {
// Targeted HDU
val indexHDU = conf.get("hdu").toInt
// Initialise
val path_init = new Path(listOfFitsFiles(0))
val fits_init = new Fits(path_init, conf, indexHDU)
if (fits_init.hdu.implemented) {
// Do not perform checks if the mode is PERMISSIVE.
if (conf.get("mode") != "PERMISSIVE") {
val schema_init = getSchema(fits_init)
fits_init.data.close()
for (file <- listOfFitsFiles.slice(1, listOfFitsFiles.size)) {
val path = new Path(file)
val fits = new Fits(path, conf, indexHDU)
val schema = getSchema(fits)
if (schema_init != schema) {
throw new AssertionError(
"""
You are trying to add HDU data with different structures!
Check that the number of columns, names of columns and element
types are the same. Re-run with .option("verbose", true) to
list all the files.
""")
}
fits.data.close()
}
}
true
} else {
println(s"""
FITS type ${fits_init.hduType} not supported yet.
An empty DataFrame will be returned.""")
false
}
}
/**
* Create a RDD[Row] from the data of one HDU.
* The input can be either the path to one FITS file (path + filename),
* or the path to a directory containing FITS files or a glob on a
* directory (*.fits). Needless to say, the FITS files must
* have the same structure, otherwise the union will be impossible.
* The format of the input must be a String with Hadoop format:
* - (local) file://path/to/data
* - (HDFS) hdfs://<IP>:<PORT>//path/to/data
*
* If the HDU type is not "implemented", return an empty RDD[Row].
*
* NOTE: Schema check needs to be fixed!
*
* @param fn : (String)
* Filename of the FITS file to be read, or a directory containing FITS files
* with the same HDU structure.
* @return (RDD[Row]) always one single RDD made from the HDU of
* one FITS file, or from the same kind of HDU from several FITS files.
* Empty if the HDU type is not a BINTABLE or IMAGE.
*
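* Example (illustrative):
* {{{
* val rdd = load("hdfs://localhost:8020/data/fits_dir")
* }}}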
*/
def load(fn : String): RDD[Row] = {
val listOfFitsFiles = searchFitsFile(fn)
// Check that all the files have the same Schema
// in order to perform the union. Return the HDU type.
// NOTE: This operation is very long for hundreds of files!
// NOTE: Limit that to the first 10 files.
// NOTE: Need to be fixed!
val implemented = if (listOfFitsFiles.size < 10) {
checkSchemaAndReturnType(listOfFitsFiles)
} else {
checkSchemaAndReturnType(listOfFitsFiles.take(10))
}
// Load one or all the FITS files found
load(listOfFitsFiles, implemented)
}
/**
* Load the HDU data from several FITS file into a single RDD[Row].
* The structure of the HDUs must be the same, that is, contain the
* same number of columns with the same names and element types. Note that
* we pass the list of all the files to newAPIHadoopFile directly, and
* Spark (Hadoop) does the union on its own. So powerful...
*
* If the HDU type is not "implemented", return an empty RDD[Row].
*
* @param fns : (List[String])
* List of filenames with the same structure.
* @return (RDD[Row]) always one single RDD[Row] made from the HDU of
* one FITS file, or from the same kind of HDU from several FITS files.
* Empty if the HDU type is not a BINTABLE or IMAGE.
*
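* Example (illustrative; the Boolean flag normally comes from
* checkSchemaAndReturnType):
* {{{
* val rdd = load(List("file:///data/a.fits", "file:///data/b.fits"), true)
* }}}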
*/
def load(fns : List[String], implemented: Boolean): RDD[Row] = {
if (verbosity) {
// Check number of files
val nFiles = fns.size
println("NFILES: ", nFiles)
}
val rdd = if (implemented) {
// Distribute the table data
sqlContext.sparkContext.newAPIHadoopFile(fns.mkString(","),
classOf[FitsFileInputFormat],
classOf[LongWritable],
classOf[Seq[Row]],
conf).flatMap(x => x._2)
} else {
// If HDU not implemented, return an empty RDD
loadOneEmpty
}
rdd
}
/**
* Return an empty RDD of Row.
*
* @return (RDD[Row]) Empty RDD.
*/
def loadOneEmpty : RDD[Row] = {
sqlContext.sparkContext.emptyRDD[Row]
}
/**
* Register user parameters in the configuration (broadcast to executors).
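*
* For instance (illustrative values): with
* {{{
* parameters = Map("path" -> "file:///data/cat.fits", "hdu" -> "1")
* }}}
* the call sets conf.get("hdu") to "1", and conf.get("mode") falls back
* to "PERMISSIVE" when not provided.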
*/
def registerConfigurations: Unit = {
for (keyAndVal <- parameters) {
conf.set(keyAndVal._1, keyAndVal._2)
extraOptions += (keyAndVal._1 -> keyAndVal._2)
}
if (conf.get("mode") == null) {
conf.set("mode", "PERMISSIVE")
extraOptions += ("mode" -> "PERMISSIVE")
}
}
/**
* The schema of the DataFrame is inferred from the
* header of the FITS HDU directly, unless the user specifies it.
*
* @return (StructType) schema for the DataFrame
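*
* A minimal sketch (hypothetical columns) of a user-provided schema,
* which bypasses header inference:
* {{{
* import org.apache.spark.sql.types._
* val userSchema = Some(StructType(List(
* StructField("target", StringType, true),
* StructField("RA", FloatType, true))))
* }}}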
*/
override def schema: StructType = {
registerConfigurations
userSchema.getOrElse{
val listOfFitsFiles = searchFitsFile(filePath)
val pathFS = new Path(listOfFitsFiles(0))
val fits = new Fits(pathFS, conf, conf.get("hdu").toInt)
// Register header and block boundaries
// in the Hadoop configuration for later re-use
fits.registerHeader
fits.blockBoundaries.register(pathFS, conf)
getSchema(fits)
}
}
/**
* Create RDD[Row] from FITS HDU data.
*
* @return (RDD[Row])
*/
override def buildScan(): RDD[Row] = {
// Register the user parameters in the Hadoop conf
registerConfigurations
// Level of verbosity. Default is false
verbosity = Try{extraOptions("verbose")}.getOrElse("false").toBoolean
// Distribute the data
load(filePath)
}
}