
geotrellis.spark.io.file.FileRDDReader.scala
GeoTrellis is an open source geographic data processing engine for high performance applications.

package geotrellis.spark.io.file

import geotrellis.spark._
import geotrellis.spark.io.avro.codecs.KeyValueRecordCodec
import geotrellis.spark.io.index.{MergeQueue, KeyIndex, IndexRanges}
import geotrellis.spark.io.avro.{AvroEncoder, AvroRecordCodec}
import geotrellis.spark.util.KryoWrapper
import geotrellis.util.Filesystem

import org.apache.avro.Schema
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import spire.syntax.cfor._

import scala.collection.mutable
import scala.reflect.ClassTag

import java.io.File
object FileRDDReader {
  /**
   * Reads Avro-encoded (K, V) records from the local filesystem into an RDD.
   * `keyPath` maps a space-filling-curve index to the file storing its records;
   * `decomposeBounds` turns queried key bounds into (start, end) index ranges;
   * when `filterIndexOnly` is set, the index ranges are trusted and decoded
   * keys are not re-checked against `queryKeyBounds`.
   */
  def read[K: AvroRecordCodec: Boundable, V: AvroRecordCodec](
    keyPath: Long => String,
    queryKeyBounds: Seq[KeyBounds[K]],
    decomposeBounds: KeyBounds[K] => Seq[(Long, Long)],
    filterIndexOnly: Boolean,
    writerSchema: Option[Schema] = None,
    numPartitions: Option[Int] = None
  )(implicit sc: SparkContext): RDD[(K, V)] = {
    if (queryKeyBounds.isEmpty) return sc.emptyRDD[(K, V)]

    // Decompose the queried key bounds into (start, end) index ranges,
    // merging overlapping ranges when more than one bounds was requested.
    val ranges = if (queryKeyBounds.length > 1)
      MergeQueue(queryKeyBounds.flatMap(decomposeBounds))
    else
      queryKeyBounds.flatMap(decomposeBounds)
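
    // For example (illustrative values): if decomposition yields the
    // overlapping ranges (2L, 5L) and (4L, 9L), MergeQueue collapses them
    // into the single range (2L, 9L), so no index is scanned twice below.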

    // Bin the ranges into one group per requested partition.
    val bins = IndexRanges.bin(ranges, numPartitions.getOrElse(sc.defaultParallelism))

    val boundable = implicitly[Boundable[K]]
    val includeKey = (key: K) => KeyBounds.includeKey(queryKeyBounds, key)(boundable)
    val _recordCodec = KeyValueRecordCodec[K, V]
    val kwWriterSchema = KryoWrapper(writerSchema) // Avro Schema is not Serializable
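
    // Each bin becomes one partition of the RDD below, so every Spark task
    // scans a disjoint set of index ranges and reads the corresponding files
    // from its local filesystem.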
    sc.parallelize(bins, bins.size)
      .mapPartitions { partition: Iterator[Seq[(Long, Long)]] =>
        val resultPartition = mutable.ListBuffer[(K, V)]()

        for (
          rangeList <- partition;
          range <- rangeList
        ) {
          val (start, end) = range
          // Walk every index in the range and read its backing file, if any.
          cfor(start)(_ <= end, _ + 1) { index =>
            val path = keyPath(index)
            if (new File(path).exists) {
              val bytes: Array[Byte] = Filesystem.slurp(path)
              val recs = AvroEncoder.fromBinary(kwWriterSchema.value.getOrElse(_recordCodec.schema), bytes)(_recordCodec)
              resultPartition ++= {
                if (filterIndexOnly)
                  recs
                else
                  recs.filter { row => includeKey(row._1) }
              }
            }
          }
        }

        resultPartition.iterator
      }
  }
}
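
A minimal sketch of how read might be invoked directly, assuming a Z-curve
index over SpatialKeys and a flat, zero-padded one-file-per-index layout.
readLayer, catalogDir, and the path scheme are illustrative assumptions, not
part of the GeoTrellis API; in GeoTrellis proper, FileLayerReader derives the
key-to-path mapping and the writer schema from the layer's attribute store.

import geotrellis.raster.Tile
import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.file.FileRDDReader
import geotrellis.spark.io.index.zcurve.ZSpatialKeyIndex
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

def readLayer(catalogDir: String, bounds: KeyBounds[SpatialKey])
             (implicit sc: SparkContext): RDD[(SpatialKey, Tile)] = {
  // Space-filling-curve index covering the layer's key bounds.
  val keyIndex = new ZSpatialKeyIndex(bounds)

  // Hypothetical layout: one file per SFC index, zero-padded to 19 digits.
  val keyPath: Long => String = index => f"$catalogDir/$index%019d"

  FileRDDReader.read[SpatialKey, Tile](
    keyPath = keyPath,
    queryKeyBounds = Seq(bounds),
    // KeyIndex#indexRanges turns key bounds into (start, end) SFC ranges.
    decomposeBounds = kb => keyIndex.indexRanges((kb.minKey, kb.maxKey)),
    // SFC ranges can over-select, so re-check each decoded key here.
    filterIndexOnly = false
  )
}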