/*
* Copyright 2014 Databricks
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.databricks.spark.xml

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import com.databricks.spark.xml.util.XmlFile

/**
* Provides access to XML data from pure SQL statements (i.e. for users of the
* JDBC server).
*/
class DefaultSource
  extends RelationProvider
  with SchemaRelationProvider
  with CreatableRelationProvider
  with DataSourceRegister {
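
  // Because this provider registers the short name "xml" via DataSourceRegister,
  // the source can be used from pure SQL. A hedged sketch (path and rowTag
  // values are assumed for illustration):
  //
  //   CREATE TEMPORARY VIEW books
  //   USING xml
  //   OPTIONS (path "books.xml", rowTag "book")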

  /**
   * Short alias for spark-xml data source.
   */
  override def shortName(): String = "xml"

  private def checkPath(parameters: Map[String, String]): String = {
    parameters.getOrElse("path", sys.error("'path' must be specified for XML data."))
  }

  /**
   * Creates a new relation for data stored in XML given parameters.
   * Parameters must include 'path'.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    createRelation(sqlContext, parameters, null)
  }
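
  // Illustrative sketch (not invoked by this file): the no-schema overload above
  // backs reads that rely on schema inference, e.g. (option values assumed):
  //
  //   val df = sqlContext.read
  //     .format("xml")
  //     .option("rowTag", "book")
  //     .load("books.xml")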

  /**
   * Creates a new relation for data stored in XML given parameters and a user-supplied schema.
   * Parameters must include 'path'.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): XmlRelation = {
    val path = checkPath(parameters)
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters)
      (options.charset, options.rowTag)
    }
    XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
      Some(path),
      parameters,
      schema)(sqlContext)
  }
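
  // Illustrative sketch: supplying an explicit schema skips inference and routes
  // through the overload above. Field names here are assumed for the example:
  //
  //   import org.apache.spark.sql.types._
  //   val schema = StructType(Seq(
  //     StructField("title", StringType, nullable = true),
  //     StructField("year", IntegerType, nullable = true)))
  //   val df = sqlContext.read
  //     .format("xml")
  //     .option("rowTag", "book")
  //     .schema(schema)
  //     .load("books.xml")

  /**
   * Saves `data` as XML at 'path' according to the given save mode, then returns a
   * relation over the written data. Parameters must include 'path'.
   */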
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    val path = checkPath(parameters)
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          sys.error(s"path $path already exists.")
        case SaveMode.Ignore => false
      }
    } else {
      true
    }
    if (doSave) {
      // Only save data when the save mode is not ignore.
      XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters)
    }

    createRelation(sqlContext, parameters, data.schema)
  }
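
  // Illustrative sketch of how the SaveMode handling above is exercised by a
  // write (output path assumed):
  //
  //   df.write
  //     .format("xml")
  //     .option("rowTag", "book")
  //     .mode(SaveMode.Overwrite)  // SaveMode.Append would hit the sys.error above
  //     .save("books-out")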
}