// com.databricks.spark.xml.package.scala Maven / Gradle / Ivy
// The newest version!
/*
 * Copyright 2014 Databricks
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.databricks.spark
import scala.collection.Map

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.spark.sql._

import com.databricks.spark.xml.util.XmlFile
package object xml {

  /**
   * Adds a method, `xmlFile`, to [[SQLContext]] that allows reading XML data.
   */
  implicit class XmlContext(sqlContext: SQLContext) extends Serializable {

    /**
     * Reads the XML data at `filePath` into a [[DataFrame]].
     *
     * Every argument except `filePath` is also recorded, in string form, in the options map
     * handed to the relation, under a key equal to the parameter name.
     *
     * @param filePath location of the XML data; also stored as the relation's `location`
     * @param rowTag stored as option `rowTag` and passed to `XmlFile.withCharset`
     * @param samplingRatio stored as option `samplingRatio`
     * @param excludeAttribute stored as option `excludeAttribute`
     * @param treatEmptyValuesAsNulls stored as option `treatEmptyValuesAsNulls`
     * @param failFast stored as option `failFast`
     * @param attributePrefix stored as option `attributePrefix`
     * @param valueTag stored as option `valueTag`
     * @param charset stored as option `charset` and passed to `XmlFile.withCharset`
     * @return the [[DataFrame]] backed by the XML relation built from these settings
     */
    @deprecated("Use read.format(\"xml\") or read.xml", "0.4.0")
    def xmlFile(
        filePath: String,
        rowTag: String = XmlOptions.DEFAULT_ROW_TAG,
        samplingRatio: Double = 1.0,
        excludeAttribute: Boolean = false,
        treatEmptyValuesAsNulls: Boolean = false,
        failFast: Boolean = false,
        attributePrefix: String = XmlOptions.DEFAULT_ATTRIBUTE_PREFIX,
        valueTag: String = XmlOptions.DEFAULT_VALUE_TAG,
        charset: String = XmlOptions.DEFAULT_CHARSET): DataFrame = {
      val parameters = Map(
        "rowTag" -> rowTag,
        "samplingRatio" -> samplingRatio.toString,
        "excludeAttribute" -> excludeAttribute.toString,
        "treatEmptyValuesAsNulls" -> treatEmptyValuesAsNulls.toString,
        "failFast" -> failFast.toString,
        "attributePrefix" -> attributePrefix,
        "valueTag" -> valueTag,
        "charset" -> charset)

      // The input is supplied as a thunk, so the file is only opened when the relation
      // invokes it.
      val xmlRelation = XmlRelation(
        () => XmlFile.withCharset(sqlContext.sparkContext, filePath, charset, rowTag),
        location = Some(filePath),
        parameters = parameters.toMap)(sqlContext)

      // The method is declared to return a DataFrame, so the relation has to be wrapped;
      // ending on the `val` above would yield Unit.
      sqlContext.baseRelationToDataFrame(xmlRelation)
    }
  }

  /**
   * Adds a method, `saveAsXmlFile`, to [[DataFrame]] that allows writing XML data.
   * If compressionCodec is not null the resulting output will be compressed.
   * Note that a `codec` entry in the parameters map takes precedence over compressionCodec.
   */
  implicit class XmlSchemaRDD(dataFrame: DataFrame) {

    /**
     * Writes the wrapped [[DataFrame]] as XML to `path`.
     *
     * @param path destination passed through to `XmlFile.saveAsXmlFile`
     * @param parameters writer options; a `codec` entry, when present, is used as-is
     * @param compressionCodec codec class used only when `parameters` has no `codec` entry;
     *                         `null` (the default) means no codec is supplied
     */
    @deprecated("Use write.format(\"xml\") or write.xml", "0.4.0")
    def saveAsXmlFile(
        path: String, parameters: Map[String, String] = Map(),
        compressionCodec: Class[_ <: CompressionCodec] = null): Unit = {
      // Prefer an explicit "codec" option, then fall back to the class argument.
      // NOTE(review): assumes XmlFile.saveAsXmlFile resolves the codec from a class-name
      // string - confirm against XmlFile.
      val codecName: String = parameters.get("codec")
        .orElse(Option(compressionCodec).map(_.getName))
        .orNull

      // The "codec" key is always written (possibly with a null value), mirroring the
      // unconditional `put` this replaces.
      XmlFile.saveAsXmlFile(dataFrame, path, parameters.toMap + ("codec" -> codecName))
    }
  }

  /**
   * Adds a method, `xml`, to DataFrameReader that allows you to read XML files using
   * the `com.databricks.spark.xml` data source.
   */
  implicit class XmlDataFrameReader(reader: DataFrameReader) {

    /** Returns a function that loads the XML data at the given path as a [[DataFrame]]. */
    def xml: String => DataFrame = reader.format("com.databricks.spark.xml").load
  }

  /**
   * Adds a method, `xml`, to DataFrameWriter that allows you to write XML files using
   * the `com.databricks.spark.xml` data source.
   */
  implicit class XmlDataFrameWriter[T](writer: DataFrameWriter[T]) {
    // Note that writing a XML file from [[DataFrame]] having a field [[ArrayType]] with
    // its element as [[ArrayType]] would have an additional nested field for the element.
    // For example, the [[DataFrame]] having a field below,
    //
    //   fieldA [[data1, data2]]
    //
    // would produce a XML file below (the nested element name `item` is illustrative).
    //
    //   <fieldA>
    //     <item>data1</item>
    //     <item>data2</item>
    //   </fieldA>
    //
    // Namely, roundtrip in writing and reading can end up in different schema structure.

    /** Returns a function that saves the data as XML at the given path. */
    def xml: String => Unit = writer.format("com.databricks.spark.xml").save
  }
}