All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.datasonnet.plugins.xml.BadgerFishHandler.scala Maven / Gradle / Ivy

package com.datasonnet.plugins.xml

/*-
 * Copyright 2019-2020 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.datasonnet.plugins.DefaultXMLFormatPlugin.{DEFAULT_NS_KEY, EffectiveParams}
import org.xml.sax.ext.DefaultHandler2
import org.xml.sax.{Attributes, SAXParseException}
import ujson.Value

import scala.collection.immutable.Set
import scala.collection.mutable

// See {@link scala.xml.parsing.FactoryAdapter}
class BadgerFishHandler(params: EffectiveParams) extends DefaultHandler2 {
  def result: ujson.Obj = collateObj(badgerStack.top)

  val buffer = new StringBuilder()
  val badgerStack = new mutable.Stack[Element]
  // ignore text until after first element starts
  var capture: Boolean = false

  private var needNewContext = true
  private val namespaceParts = new Array[String](3)  // keep reusing a single array
  private val namespaces = new OverridingNamespaceTranslator(params.declarations)
  private var currentNS: mutable.LinkedHashMap[String, ujson.Str] = mutable.LinkedHashMap()

  // root
  badgerStack.push(Element(ujson.Obj()))

  override def startPrefixMapping(prefix: String, uri: String): Unit = {
    if(needNewContext) {
      namespaces.pushContext()
      needNewContext = false
      if(currentNS.nonEmpty) currentNS = currentNS.empty
    }
    namespaces.declarePrefix(prefix, uri)
    val newPrefix = namespaces.getPrefix(uri)
    currentNS.put(if (newPrefix == null) DEFAULT_NS_KEY else newPrefix, ujson.Str(uri))
  }

  override def startElement(uri: String,
                            _localName: String,
                            qname: String,
                            attributes: Attributes): Unit = {
    if (needNewContext) namespaces.pushContext()
    needNewContext = true

    captureText()
    capture = true

    val current = ujson.Obj()
    if (currentNS.nonEmpty) {
      current.value.addOne((params.xmlnsKey, currentNS))
      currentNS = currentNS.empty
    }

    if (attributes.getLength > 0) {
      val attrs = mutable.ListBuffer[(String, ujson.Str)]()

      for (i <- 0 until attributes.getLength) {
        val qname = attributes getQName i
        val value = attributes getValue i
        val translated = processName(qname, true)

        attrs.addOne((params.attrKeyPrefix + translated, ujson.Str(value)))
      }
      current.value.addAll(attrs.toList)
    }

    badgerStack.push(Element(current))
  }

  override def characters(ch: Array[Char], offset: Int, length: Int): Unit = {
    if (capture) buffer.appendAll(ch, offset, length)
  }

  override def startCDATA(): Unit = {
    captureText()
  }

  override def endCDATA(): Unit = {
    if (buffer.nonEmpty) {
      badgerStack.top.children = badgerStack.top.children :+ CData(buffer.toString)
    }
    buffer.clear()
  }

  def examineChildren(children: List[Node]): ChildrenCase.Value = {
    val classes = children.map(_.getClass).toSet
    if (classes == Set(classOf[Text])) {
      ChildrenCase.ALL_SIMPLE_TEXT
    } else if (classes subsetOf Set(classOf[Text], classOf[CData])) {
      ChildrenCase.MIXED_TEXT
    } else if (classes == Set(classOf[Element])) {
      if(wellOrdered(children.asInstanceOf[List[Element]])) {
        ChildrenCase.STRUCTURED_CONTENT
      } else {
        ChildrenCase.OUT_OF_ORDER_ELEMENTS
      }
    } else {
      ChildrenCase.MIXED_CONTENT
    }
  }

  def wellOrdered(children: List[Element]): Boolean = {
    val found = mutable.Set[String]()
    var last = ""
    children.map(_.name).forall(v => if (last == v || !found.contains(v)) {
      found.add(v)
      last = v
      true
    } else {
      false
    })
  }

  def mixedNodes(children: List[Node]): IterableOnce[(String, Value)] = {
    val values = mutable.LinkedHashMap[String, Value]()
    children.zipWithIndex.foreach {
      case (node, index) =>
        val index1 = index + 1
        node match {
          case Text(value) => values += (params.textKeyPrefix + index1 -> ujson.Str(value))
          case CData(value) => values += (params.cdataKeyPrefix + index1 -> ujson.Str(value))
          case Element(obj, name, _) => {
            obj.value += (params.orderingKey -> index1)
            if (values.contains(name)) {
              (values(name): @unchecked) match {
                case ujson.Arr(arr) => arr += obj
                case ujson.Obj(existing) => values += (name -> ujson.Arr(existing, obj))
              }
            } else {
              values += (name -> obj)
            }
          }
        }
    }
    values
  }

  def combinedElements(children: List[Element]): IterableOnce[(String, Value)] = {
    val values = mutable.LinkedHashMap[String, Value]()
    // note, do not use groupBy as it does not provide the ordering guarantee we need
    children.foreach(value => {
      if (values.contains(value.name)) {
        (values(value.name): @unchecked) match {
          case ujson.Arr(arr) => arr.addOne(value.obj)
          case ujson.Obj(existing) => values(value.name) = ujson.Arr(existing, value.obj)
        }
      } else {
        values(value.name) = value.obj
      }
    })
    values
  }

  override def endElement(uri: String, _localName: String, qname: String): Unit = {
    captureText()

    val translated = processName(qname, false)
    val newName = translated.replaceFirst(":", params.nsSeparator)
    val current = badgerStack.pop
    val parent = badgerStack.top

    // don't include the current children because we've already handled them
    parent.children = parent.children :+ Element(collateObj(current), newName)

    capture = badgerStack.size != 1 // root level
    namespaces.popContext()
  }

  private def collateObj(current: Element): ujson.Obj = {
    val children = current.children
    val childrenCase = examineChildren(children)

    childrenCase match {
      case ChildrenCase.ALL_SIMPLE_TEXT => {
        // they are all text nodes, so just combine them into one and add that
        val combined = children.asInstanceOf[List[Textual]].map(_.value).mkString("")
        current.obj.value += (params.textKeyPrefix -> ujson.Str(combined))
      }
      case ChildrenCase.MIXED_TEXT => {
        // all textual nodes, so combine them
        val combined = children.asInstanceOf[List[Textual]].map(_.value).mkString("")
        current.obj.value += (params.textKeyPrefix -> ujson.Str(combined))

        // but also preserve them as individual nodes just in case that's of interest
        current.obj.value ++= mixedNodes(children)
      }
      case _ => current.obj.value ++= mixedNodes(children)
    }
    current.obj
  }

  private def processName(qname: String, isAttribute: Boolean) = {
    // while processName can return null here, it will only do so if the XML
    // namespace processing is written incorrectly, so if you see this line in a stack trace,
    // go verifying what namespace-related calls have been made
    namespaces.processName(qname, namespaceParts, isAttribute)(2)
  }

  def captureText(): Unit = {
    if (capture && buffer.nonEmpty) {
      val string = buffer.toString
      // TODO: change to a isNotBlank func
      if (string.trim.nonEmpty) {
        badgerStack.top.children = badgerStack.top.children :+ Text(string)
      }
    }

    buffer.clear()
  }

  override def warning(ex: SAXParseException): Unit = {}

  override def error(ex: SAXParseException): Unit = printError("Error", ex)

  override def fatalError(ex: SAXParseException): Unit = printError("Fatal Error", ex)

  protected def printError(errtype: String, ex: SAXParseException): Unit =
    Console.withOut(Console.err) {
      val s = "[%s]:%d:%d: %s".format(
        errtype, ex.getLineNumber, ex.getColumnNumber, ex.getMessage)
      Console.println(s)
      Console.flush()
    }



}


object ChildrenCase extends Enumeration {
  val ALL_SIMPLE_TEXT, MIXED_TEXT, OUT_OF_ORDER_ELEMENTS, MIXED_CONTENT, STRUCTURED_CONTENT = Value
}

sealed abstract class Node
abstract class Textual(val value: String) extends Node
case class Text(override val value: String) extends Textual(value)
case class CData(override val value: String) extends Textual(value)
case class Element(obj: ujson.Obj, name: String = "", var children: List[Node] = List()) extends Node




© 2015 - 2025 Weber Informatics LLC | Privacy Policy