All downloads are FREE. The search and download features use the official Maven repository.

com.databricks.spark.xml.parsers.StaxXmlParserUtils.scala Maven / Gradle / Ivy

The newest version!
package com.databricks.spark.xml.parsers

import javax.xml.stream.XMLEventReader
import javax.xml.stream.events._

import scala.annotation.tailrec

import com.databricks.spark.xml.XmlOptions

private[xml] object StaxXmlParserUtils {
  /**
   * Advances the reader until an event of the requested type is reached.
   *
   * The first event is only peeked: if it already has the requested type it is returned
   * without being consumed. Otherwise events are consumed one at a time, and the first
   * consumed event of the requested type is returned.
   *
   * @param parser    the event reader to advance
   * @param eventType the event type to look for, compared against `XMLEvent.getEventType`
   * @return the matching event, or the last event seen when the reader runs out first
   */
  def skipUntil(parser: XMLEventReader, eventType: Int): XMLEvent = {
    var event = parser.peek
    while (parser.hasNext && event.getEventType != eventType) {
      event = parser.nextEvent
    }
    event
  }

  /**
   * Checks whether the next structural event is the end of an element (or of the document).
   *
   * Events that are neither a `StartElement`, an `EndElement` nor an `EndDocument` are
   * consumed from the reader while looking; the structural event that decides the answer
   * is only peeked and stays in the reader.
   *
   * @param parser the event reader to inspect
   * @return `true` when an `EndElement` or `EndDocument` comes next, `false` when a
   *         `StartElement` comes next
   */
  @tailrec
  def checkEndElement(parser: XMLEventReader): Boolean = {
    parser.peek match {
      case _: EndElement | _: EndDocument => true
      case _: StartElement => false
      case _ =>
        // Anything else (for example whitespace between an `EndElement` and the next
        // `StartElement`) cannot decide the answer, so drop it and look at the next event.
        parser.nextEvent
        checkEndElement(parser)
    }
  }

  /**
   * Produces a name-to-value map from the given attributes.
   *
   * Keys are `options.attributePrefix` followed by the local part of the attribute name.
   * When `options.treatEmptyValuesAsNulls` is set, values that are empty after trimming
   * are stored as `null`. When two attributes produce the same key, the later one wins.
   *
   * @param attributes the attributes to convert
   * @param options    parsing options; `excludeAttributeFlag`, `attributePrefix` and
   *                   `treatEmptyValuesAsNulls` are read
   * @return the attribute map, or an empty map when `options.excludeAttributeFlag` is set
   */
  def convertAttributesToValuesMap(
      attributes: Array[Attribute],
      options: XmlOptions): Map[String, String] = {
    if (options.excludeAttributeFlag) {
      Map.empty[String, String]
    } else {
      attributes.map { attr =>
        val field = options.attributePrefix + attr.getName.getLocalPart
        val value = attr.getValue
        val nullSafeValue =
          if (options.treatEmptyValuesAsNulls && value.trim.isEmpty) null else value
        field -> nullSafeValue
      }.toMap
    }
  }


  /**
   * Converts the current structure of the XML document to an XML string.
   *
   * Events are consumed from the reader until an `EndElement` is followed (ignoring
   * non-structural events, see [[checkEndElement]]) by another `EndElement` or by the
   * `EndDocument`. Element names are written with `getName`'s string form; attributes
   * are not written.
   *
   * @param parser the event reader, positioned before the structure to convert
   * @return the start tags, character data and end tags read, concatenated in order
   */
  def currentStructureAsString(parser: XMLEventReader): String = {
    // (Hyukjin) I could not find a proper method to produce the current document
    // as a string. For Jackson, there is a method `copyCurrentStructure()`.
    // So, it ended up with manually converting event by event to string.
    val xml = new StringBuilder

    // Appends what directly follows a start tag. Nested elements are consumed by the
    // recursive call. Plain character data is only peeked here; the main loop below
    // consumes (and ignores) that event on its next iteration.
    def appendChildren(): Unit = {
      parser.peek match {
        case _: StartElement =>
          xml.append(currentStructureAsString(parser))
        case c: Characters if c.isWhiteSpace =>
          // There can be a `Characters` event between `StartElement`s.
          // So, we need to check further to decide if this is a data or just
          // a whitespace between them.
          xml.append(c.getData)
          parser.next
          parser.peek match {
            case _: StartElement =>
              xml.append(currentStructureAsString(parser))
            case _: XMLEvent =>
              // do nothing
          }
        case c: Characters =>
          xml.append(c.getData)
        case _: XMLEvent =>
          // do nothing
      }
    }

    var shouldStop = false
    while (!shouldStop) {
      parser.nextEvent match {
        case e: StartElement =>
          xml.append(s"<${e.getName}>")
          appendChildren()
        case e: EndElement =>
          // Emit the closing tag so that the produced string is balanced XML.
          xml.append(s"</${e.getName}>")
          shouldStop = checkEndElement(parser)
        case _: XMLEvent => // do nothing
      }
    }
    xml.toString
  }

  /**
   * Skips the children of the current XML element.
   *
   * Returns immediately when [[checkEndElement]] already reports an end. Otherwise
   * events are consumed until an `EndElement` is followed by another end; that final
   * end event is left in the reader.
   *
   * @param parser the event reader, positioned just after the start tag of the element
   *               whose children are to be skipped
   */
  def skipChildren(parser: XMLEventReader): Unit = {
    var shouldStop = checkEndElement(parser)
    while (!shouldStop) {
      parser.nextEvent match {
        case _: StartElement =>
          val e = parser.peek
          if (e.isCharacters && e.asCharacters.isWhiteSpace) {
            // There can be a `Characters` event between `StartElement`s.
            // So, we need to check further to decide if this is a data or just
            // a whitespace between them.
            parser.next
          }
          if (parser.peek.isStartElement) {
            // A nested element follows: skip its subtree recursively.
            skipChildren(parser)
          }
        case _: EndElement =>
          shouldStop = checkEndElement(parser)
        case _: XMLEvent => // do nothing
      }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy