
package edu.ucr.cs.bdlab.beast.io
import com.fasterxml.jackson.databind.ObjectMapper
import edu.ucr.cs.bdlab.beast.common.BeastOptions
import edu.ucr.cs.bdlab.beast.geolite.IFeature
import org.apache.commons.io.IOCase
import org.apache.commons.io.filefilter.WildcardFileFilter
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.spark.paths.SparkPath
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.connector.read._
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import java.io.File
import java.util
import scala.reflect.ClassTag
/**
 * A generic SparkSQL data source for spatial files, i.e., any format with a reader that extends [[FeatureReader]].
 * The class can be further extended to provide additional tweaks, such as projection push-down, if supported.
 */
abstract class SpatialFileSource[FR <: FeatureReader](implicit t: ClassTag[FR])
  extends TableProvider with DataSourceRegister {
  // TODO implement a fallback to a V1 source with FileFormat due to a problem with Spark
  /**
   * Returns the schema of the file by opening the first split and reading one feature from it.
   * Subclasses might be able to provide a better implementation that does not require parsing the first record.
   * @param options an immutable case-insensitive string-to-string map that can identify a table,
   *                e.g., file path, Kafka topic name, etc.
   * @return the schema of the input file
   */
  override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
    val inputPaths = SpatialFileSourceHelper.getSpatialFilePaths[FR](options).map(new Path(_))
    for (inputPath <- inputPaths) {
      // Create a fresh reader for each candidate file since the reader is closed after each attempt
      val reader = t.runtimeClass.getConstructor().newInstance().asInstanceOf[FeatureReader]
      try {
        val fileSystem = inputPath.getFileSystem(SpatialFileSourceHelper.hadoopConf)
        val length = fileSystem.getFileStatus(inputPath).getLen
        val filesplit = new FileSplit(inputPath, 0, length, Array())
        reader.initialize(filesplit, BeastOptions.fromMap(options))
        if (reader.hasNext) {
          // Use the schema of the first feature as the schema of the entire input
          return reader.next().schema
        }
      } finally {
        reader.close()
      }
    }
    // No feature could be read from any input file; fall back to an empty schema
    StructType(Seq())
  }
  override def getTable(schema: StructType, transforms: Array[Transform], options: util.Map[String, String]): Table =
    new SpatialFileTable[FR](schema, options)

  override def supportsExternalMetadata(): Boolean = true
}
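/*
 * A minimal sketch (not part of the original source) of how a concrete, read-only source could be declared on
 * top of [[SpatialFileSource]]. `DummyReader` and the short name "dummy" are hypothetical; only the wiring is
 * shown: the reader class is supplied as the type parameter and [[DataSourceRegister.shortName]] names the format.
 *
 *   class DummySource extends SpatialFileSource[DummyReader] {
 *     override def shortName(): String = "dummy"
 *   }
 *
 * Listing DummySource in META-INF/services/org.apache.spark.sql.sources.DataSourceRegister would then allow
 * spark.read.format("dummy").load(...).
 */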
/**
 * A generic SparkSQL data source and sink for spatial files. It works on a pair of classes that extend
 * [[FeatureReader]] and [[FeatureWriter]], respectively.
 * The class can be further extended to provide additional tweaks, such as projection push-down, if supported.
 */
abstract class SpatialFileSourceSink[FR <: FeatureReader, FW <: FeatureWriter]
    (implicit t: ClassTag[FR], t2: ClassTag[FW]) extends SpatialFileSource[FR] {
  override def getTable(schema: StructType, transforms: Array[Transform], options: util.Map[String, String]): Table = {
    new SpatialFileTableRW[FR, FW](schema, options)
  }
}
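/*
 * Usage sketch (hypothetical names): given a concrete subclass of [[SpatialFileSourceSink]] registered under the
 * short name "dummy" and paired with a [[FeatureWriter]], both directions go through the standard DataFrame API:
 *
 *   val df = spark.read.format("dummy").load("hdfs:///data/input.dummy")
 *   df.write.format("dummy").mode("overwrite").save("hdfs:///data/output")
 *
 * Writing is possible because [[SpatialFileTableRW]] below adds BATCH_WRITE and TRUNCATE to the capabilities.
 */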
/**
 * A class that represents a table of spatial data.
 * @param schema the schema of the table including both spatial and non-spatial attributes
 * @param options additional user options
 * @param t the class tag of the feature reader
 * @tparam FR the class of the feature reader
 */
class SpatialFileTable[FR <: FeatureReader](override val schema: StructType, options: util.Map[String, String])
    (implicit t: ClassTag[FR]) extends SupportsRead {

  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
    new SpatialFileScanBuilder(schema, options)

  /** The table name combines the reader's short name (from its [[SpatialReaderMetadata]] annotation) and the first input path */
  override def name(): String = {
    val shortname: String = t.runtimeClass.getAnnotation(classOf[SpatialReaderMetadata]).shortName()
    val filename: String = SpatialFileSourceHelper.getPaths(options).head
    s"${shortname}:${filename}"
  }

  /** This table only supports batch reading */
  override def capabilities(): util.Set[TableCapability] = {
    val capabilities = new util.HashSet[TableCapability]()
    capabilities.add(TableCapability.BATCH_READ)
    capabilities
  }
}
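/*
 * Illustration (hypothetical values, not part of the original source): for a feature reader annotated with
 * @SpatialReaderMetadata(shortName = "dummy") and options containing path = "hdfs:///data/points.dummy",
 *
 *   table.name()          // => "dummy:hdfs:///data/points.dummy"
 *   table.capabilities()  // => {BATCH_READ}
 */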
/**
 * A spatial file table that supports both reading and writing.
 * @param schema the schema of the table including both spatial and non-spatial attributes
 * @param options additional user options
 * @tparam FR the class of the feature reader
 * @tparam FW the class of the feature writer
 */
class SpatialFileTableRW[FR <: FeatureReader, FW <: FeatureWriter]
    (override val schema: StructType, options: util.Map[String, String])(implicit t: ClassTag[FR], t2: ClassTag[FW])
  extends SpatialFileTable[FR](schema, options) with SupportsWrite {

  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = new SpatialWriteBuilder(info)

  /** In addition to batch reading, this table supports batch writing and truncation */
  override def capabilities(): util.Set[TableCapability] = {
    val capabilities = super.capabilities()
    capabilities.add(TableCapability.BATCH_WRITE)
    capabilities.add(TableCapability.TRUNCATE)
    capabilities
  }
}
/** Helper functions shared by the spatial file sources defined in this file */
object SpatialFileSourceHelper {
  /** The minimum size of a split */
  val MinSplitSize: String = FileInputFormat.SPLIT_MINSIZE

  /** The maximum size of a split */
  val MaxSplitSize: String = FileInputFormat.SPLIT_MAXSIZE

  /** A path filter that skips hidden files which begin with "." or "_" */
  val HiddenFileFilter: PathFilter = (p: Path) => p.getName.charAt(0) != '_' && p.getName.charAt(0) != '.'

  /** A path filter that only selects master files */
  val MasterFileFilter: PathFilter = (p: Path) => p.getName.startsWith("_master")

  /** The active Spark session */
  lazy val sparkSession: SparkSession = SparkSession.active

  /** The Hadoop configuration used to initialize [[org.apache.hadoop.fs.FileSystem]] instances, e.g., for HDFS */
  lazy val hadoopConf: Configuration = sparkSession.sessionState.newHadoopConf()
  /**
   * Get a sequence of all input paths specified by the user.
   * The "paths" option is expected to be a JSON-encoded array of strings, while the "path" option is a single path.
   * @param options the user-provided options
   * @return a sequence of all input paths as strings
   */
  def getPaths(options: util.Map[String, String]): Seq[String] = {
    val objectMapper = new ObjectMapper()
    // The "paths" option, if present, holds a JSON array of path strings
    val paths = Option(options.get("paths")).map { pathStr =>
      objectMapper.readValue(pathStr, classOf[Array[String]]).toSeq
    }.getOrElse(Seq.empty)
    // Append the single "path" option, if present
    paths ++ Option(options.get("path")).toSeq
  }
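  /*
   * Illustration (hypothetical values): the "paths" option is a JSON-encoded array while "path" is a single
   * string; the single path is appended after the array entries.
   *
   *   val opts = new java.util.HashMap[String, String]()
   *   opts.put("paths", """["hdfs:///data/a.geojson","hdfs:///data/b.geojson"]""")
   *   opts.put("path", "file:///tmp/c.geojson")
   *   getPaths(opts)  // => Seq("hdfs:///data/a.geojson", "hdfs:///data/b.geojson", "file:///tmp/c.geojson")
   */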
  /**
   * Get a list of all files that can be read from the input paths. Files are matched against the wildcard
   * filter declared in the [[SpatialReaderMetadata]] annotation of the feature reader, if any.
   * @param options the options that the user specified including the input
   * @return a sequence of all matching file paths as strings
   */
  def getSpatialFilePaths[FR <: FeatureReader](options: util.Map[String, String])(implicit t: ClassTag[FR]): Seq[String] = {
    val paths = getPaths(options)
    val metadata: SpatialReaderMetadata = t.runtimeClass.getAnnotation(classOf[SpatialReaderMetadata])
    // Build a path filter from the wildcard filter in the reader metadata, or accept all files if none is defined
    def filematch: PathFilter = if (metadata.filter() == null || metadata.filter().isEmpty) {
      _: Path => true
    } else {
      val wildcard = metadata.filter()
      val wildcardFilter = new WildcardFileFilter(wildcard.split("\n"), IOCase.SYSTEM)
      path: Path => wildcardFilter.accept(new File(path.getName))
    }
    paths.flatMap(in => {
      val path = new Path(in)
      val filesystem = path.getFileSystem(hadoopConf)
      if (filesystem.isFile(path) && filematch.accept(path))
        Option(in).iterator
      else if (filesystem.isDirectory(path))
        // Expand directories one level and keep only the files that match the filter
        filesystem.listStatus(path, filematch).map(_.getPath.toString)
      else
        Option.empty[String].iterator
    })
  }
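  /*
   * Illustration (hypothetical values): for a reader annotated with
   * @SpatialReaderMetadata(shortName = "dummy", filter = "*.dummy"), an input directory "hdfs:///data" containing
   * {points.dummy, readme.txt, lines.dummy} expands to the two .dummy files only, while a single-file input path
   * is returned as-is when it matches the filter.
   */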
  /**
   * Adds a path to the list of splits by creating the appropriate file splits. If the file is splittable,
   * multiple splits might be added.
   *
   * @param fileSystem the file system that contains the input
   * @param fileStatus the status of the input file
   * @param start the start offset in the file
   * @param length the length of the part of the file to add
   * @param noSplit a boolean flag that is set to avoid splitting files
   * @param partitionInfo information about the spatial attributes of the partition or `null` if unknown
   * @param options the user-provided options used to determine the minimum and maximum split sizes
   * @param _partitions (output) the created partitions are added to this list
   */
  def addPartitions(fileSystem: FileSystem, fileStatus: FileStatus,
                    start: Long, length: Long, noSplit: Boolean,
                    partitionInfo: IFeature, options: util.Map[String, String],
                    _partitions: collection.mutable.ArrayBuffer[InputPartition]): Unit = {
    // The target split size is the file block size clamped to the user-provided minimum and maximum split sizes
    val splitSize = if (noSplit) length else {
      val minSize = options.getOrDefault(MinSplitSize, "1").toLong
      val maxSize = options.getOrDefault(MaxSplitSize, Long.MaxValue.toString).toLong
      Math.max(minSize, Math.min(maxSize, fileStatus.getBlockSize))
    }
    val blkLocations = fileSystem.getFileBlockLocations(fileStatus, start, length)
    var partitionStart = start
    val fileEnd = start + length
    val SPLIT_SLOP = 1.1 // allow the last split to grow by up to 10% to avoid a tiny trailing split
    while (partitionStart < fileEnd) {
      // Locate the block that contains the start of this split to use its hosts as preferred locations
      val blkIndex = blkLocations.find(bl => partitionStart >= bl.getOffset && partitionStart < bl.getOffset + bl.getLength)
      assert(blkIndex.isDefined, s"No locations found for block at offset $partitionStart")
      val partitionEnd = if ((fileEnd - partitionStart).toDouble / splitSize > SPLIT_SLOP)
        partitionStart + splitSize
      else
        fileEnd
      _partitions.append(
        if (partitionInfo != null)
          new SpatialDataframePartition(_partitions.length, Array(PartitionedFile(null, SparkPath.fromFileStatus(fileStatus), partitionStart,
            partitionEnd - partitionStart, blkIndex.get.getHosts)), partitionInfo)
        else
          new FilePartition(_partitions.length, Array(PartitionedFile(null, SparkPath.fromFileStatus(fileStatus), partitionStart,
            partitionEnd - partitionStart, blkIndex.get.getHosts)))
      )
      partitionStart = partitionEnd
    }
  }
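  /*
   * Worked example (hypothetical numbers): for a 260 MB file with a 128 MB block size, default min/max split
   * sizes, and noSplit = false, the target split size is 128 MB. The first split covers [0, 128 MB). The
   * remaining 132 MB is only about 1.03x the split size, which is within the 10% slop, so it becomes a single
   * second split [128 MB, 260 MB) instead of leaving a tiny 4 MB trailing split.
   */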
}
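// A self-contained usage sketch, not part of the original source. It assumes that some concrete subclass of
// SpatialFileSourceSink is registered under the short name "geojson"; the format name and file paths are
// placeholders, not guarantees about what this library ships.
object SpatialFileSourceExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SpatialFileSourceExample").master("local[*]").getOrCreate()
    try {
      // Reading goes through inferSchema above, which parses the first feature to build the schema
      val features = spark.read.format("geojson").load("file:///tmp/input.geojson")
      features.printSchema()
      // Writing is available because SpatialFileTableRW advertises BATCH_WRITE
      features.write.format("geojson").mode("overwrite").save("file:///tmp/output_geojson")
    } finally {
      spark.stop()
    }
  }
}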