
/*
 * Copyright 2019 The Glow Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.projectglow.sql

import java.net.URI
import java.util.ServiceLoader

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import io.projectglow.common.{GlowLogging, WithUtils}
/**
 * Base class for big file datasources. Handles plumbing that's necessary for all such sources:
 *  - Checking the save mode
 *  - Uploading an RDD of byte arrays
 */
abstract class BigFileDatasource extends CreatableRelationProvider {

  /**
   * Implemented by subclasses. Must return an RDD where each partition is exactly 1 byte array.
   */
  protected def serializeDataFrame(options: Map[String, String], df: DataFrame): RDD[Array[Byte]]

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      options: Map[String, String],
      data: DataFrame): BaseRelation = {

    val path = BigFileDatasource.checkPath(options)
    val filesystemPath = new Path(path)
    val hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConfWithOptions(options)
    val fs = filesystemPath.getFileSystem(hadoopConf)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          sys.error(s"Path $path already exists.")
        case SaveMode.Ignore => false
      }
    } else {
      true
    }

    if (doSave) {
      WithUtils.withCachedDataset(data) { cachedDs =>
        val byteRdd = serializeDataFrame(options, cachedDs)
        SingleFileWriter.write(byteRdd, path, hadoopConf)
      }
    }

    SingleFileRelation(sqlContext, data.schema)
  }
}
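
/**
 * A minimal sketch of a hypothetical subclass (illustrative only, not part of Glow). It renders
 * each partition's rows as tab-separated text and concatenates them into one byte array,
 * satisfying the one-array-per-partition contract of serializeDataFrame.
 */
class TsvBigFileDatasource extends BigFileDatasource {
  override protected def serializeDataFrame(
      options: Map[String, String],
      df: DataFrame): RDD[Array[Byte]] = {
    df.rdd.mapPartitions { rows =>
      // Accumulate the partition's rows into a single text blob, then emit exactly one array
      val sb = new StringBuilder
      rows.foreach(row => sb.append(row.mkString("\t")).append('\n'))
      Iterator(sb.toString.getBytes("UTF-8"))
    }
  }
}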
object BigFileDatasource {
  def checkPath(parameters: Map[String, String]): String = {
    parameters.getOrElse("path", sys.error("'path' must be specified"))
  }
}
case class SingleFileRelation(sqlContext: SQLContext, schema: StructType) extends BaseRelation

/**
 * Uploads an RDD of byte arrays as a single file in a storage system. Implementations are
 * discovered at runtime via ServiceLoader.
 */
trait BigFileUploader {
  def canUpload(path: String, conf: Configuration): Boolean
  def upload(bytes: RDD[Array[Byte]], path: String, conf: Configuration): Unit
}
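
/**
 * A hedged sketch of a hypothetical uploader (illustrative only, not part of Glow). Concrete
 * implementations are discovered through ServiceLoader, so they must also be registered in a
 * META-INF/services/io.projectglow.sql.BigFileUploader resource file.
 */
class LoggingUploader extends BigFileUploader with GlowLogging {
  override def canUpload(path: String, conf: Configuration): Boolean =
    new URI(path).getScheme == "dummy" // hypothetical scheme, for illustration only

  override def upload(bytes: RDD[Array[Byte]], path: String, conf: Configuration): Unit =
    logger.info(s"Would upload ${bytes.getNumPartitions} chunks to $path")
}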
object SingleFileWriter extends GlowLogging {

  lazy val uploaders: Seq[BigFileUploader] = ServiceLoader
    .load(classOf[BigFileUploader])
    .iterator()
    .asScala
    .toSeq

  /**
   * Writes a single file in parallel to a storage system.
   *
   * Infers the destination storage system from the provided path.
   *
   * @param rdd The RDD to write.
   * @param path The path to write the RDD to.
   * @param hadoopConf The Hadoop configuration used to resolve the destination filesystem.
   */
  def write(rdd: RDD[Array[Byte]], path: String, hadoopConf: Configuration): Unit = {
    val uri = new URI(path)
    uploaders.find(_.canUpload(path, hadoopConf)) match {
      case Some(uploader) => uploader.upload(rdd, path, hadoopConf)
      case None =>
        logger.info(s"Could not find a parallel uploader for $path, uploading from the driver")
        writeFileFromDriver(new Path(uri), rdd, hadoopConf)
    }
  }

  private def writeFileFromDriver(
      path: Path,
      byteRdd: RDD[Array[Byte]],
      hadoopConf: Configuration): Unit = {
    val fs = path.getFileSystem(hadoopConf)
    WithUtils.withCloseable(fs.create(path)) { stream =>
      WithUtils.withCachedRDD(byteRdd) { cachedRdd =>
        // Force materialization so the partitions are computed in parallel on the cluster;
        // toLocalIterator then streams the cached chunks to the driver one at a time.
        cachedRdd.count()
        cachedRdd.toLocalIterator.foreach { chunk =>
          stream.write(chunk)
        }
      }
    }
  }
}
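
// Example usage (a sketch): Glow's single-file VCF writer extends BigFileDatasource and is
// exposed under the "bigvcf" format name, so a save looks like:
//
//   df.write.format("bigvcf").save("/tmp/genotypes.vcf")
//
// The save path is read via BigFileDatasource.checkPath, and SaveMode semantics follow
// createRelation above: Overwrite deletes the existing file, Append fails, ErrorIfExists
// raises if the path exists, and Ignore is a no-op.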