/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.daffodil.xml

import java.io.File
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.nio.file.Paths
import java.nio.file.StandardOpenOption
import scala.annotation.tailrec
import scala.collection.mutable
import scala.collection.mutable.ArrayBuilder
import scala.xml.NamespaceBinding
import scala.xml._
import org.apache.commons.io.IOUtils
import org.apache.daffodil.calendar.DFDLDateConversion
import org.apache.daffodil.calendar.DFDLDateTimeConversion
import org.apache.daffodil.calendar.DFDLTimeConversion
import org.apache.daffodil.exceptions._
import org.apache.daffodil.schema.annotation.props.LookupLocation
import org.apache.daffodil.util.Maybe
import org.apache.daffodil.util.Misc
import org.xml.sax.XMLReader

import javax.xml.XMLConstants
import scala.util.matching.Regex

/**
 * Utilities for handling XML
 */

object XMLUtils {

  lazy val schemaForDFDLSchemas =
    Misc.getRequiredResource("org/apache/daffodil/xsd/XMLSchema_for_DFDL.xsd")

  lazy val dafextURI =
    Misc.getRequiredResource("org/apache/daffodil/xsd/dafext.xsd")

  lazy val tdmlURI =
    Misc.getRequiredResource("org/apache/daffodil/xsd/tdml.xsd")

  /**
   * We must have xsi prefix bound to the right namespace.
   * That gets enforced elsewhere.
   */
  val xmlNilAttribute = new PrefixedAttribute("xsi", "nil", "true", scala.xml.Null)

  val PositiveInfinityString = "INF"
  val NegativeInfinityString = "-INF"
  val NaNString = "NaN"

  /**
   * Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
   *
   * Note that this function is curried. You first close over the parameters of the algorithm
   * to obtain a function that converts individual characters.
   */
  def remapXMLIllegalCharToPUA(
    checkForExistingPUA: Boolean = true,
    replaceCRWithLF: Boolean = true)(c: Char): Char = {
    val cInt = c.toInt
    val res = cInt match {
      case 0x9 => c
      case 0xA => c
      case 0xD =>
        if (replaceCRWithLF) 0xA.toChar // Map CR to LF. That's what XML does.
        else 0xE00D.toChar // or remap it to PUA so it is non-whitespace, and preserved.
      case _ if (cInt < 0x20) => (cInt + 0xE000).toChar // ascii c0 controls
      case _ if (cInt > 0xD7FF && cInt < 0xE000) => (cInt + 0x1000).toChar // surrogate code points
      case _ if (cInt >= 0xE000 && cInt <= 0xF8FF) => { // Unicode PUA is E000 to F8FF.
        if (checkForExistingPUA)
          Assert.usageError("Pre-existing Private Use Area (PUA) character found in data: '%s'".format(c))
        else c
      }
      case 0xFFFE => 0xF0FE.toChar
      case 0xFFFF => 0xF0FF.toChar
      case _ if (cInt > 0x10FFFF) => {
        Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(cInt))
      }
      case _ => c

    }
    res
  }
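  /*
   * Illustrative sketch of using remapXMLIllegalCharToPUA above (not part of the
   * original source; uses only members defined in this file). The curried first
   * parameter list fixes the policy; the result maps one Char at a time:
   * {{{
   *   val remap = XMLUtils.remapXMLIllegalCharToPUA(checkForExistingPUA = false, replaceCRWithLF = true) _
   *   remap(0x01.toChar) // 0xE001.toChar: a C0 control moved into the PUA
   *   remap('\r')        // '\n': CR becomes LF because replaceCRWithLF = true
   *   remap('A')         // 'A': legal XML characters pass through unchanged
   * }}}
   */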

  /**
   * Scans the string looking for XML-illegal characters. True if any are found.
   *
   * Note that this considers CR (0x0d) to be a character that requires remapping.
   */
  def needsXMLToPUARemapping(s: String): Boolean = {
    var i = 0
    val len = s.length
    while (i < len) {
      val v = s.charAt(i).toInt
      if ((v < 0x20 && !(v == 0xA || v == 0x9)) ||
        (v > 0xD7FF && v < 0xE000) ||
        (v >= 0xE000 && v <= 0xF8FF) ||  // Unicode PUA is E000 to F8FF
        (v == 0xFFFE) ||
        (v == 0xFFFF) ||
        (v > 0x10FFFF)) {
        return true
      }
      i += 1
    }
    false
  }

  /**
   * Reverse of the remapXMLIllegalCharToPUA method
   */
  def remapPUAToXMLIllegalChar(c: Char): Char = {
    val cInt = c.toInt
    val res = cInt match {
      case _ if (c >= 0xE000 && c <= 0xE01F) => (c - 0xE000).toChar // Ascii c0 controls
      case _ if (c >= 0xE800 && c <= 0xEFFF) => (c - 0x1000).toChar // surrogate codepoints
      case 0xF0FE => 0xFFFE.toChar
      case 0xF0FF => 0xFFFF.toChar
      case _ if (c > 0x10FFFF) => {
        Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(c.toInt))
      }
      case _ => c
    }
    res
  }

  /**
   * Determines if we need to unmap PUA-mapped characters back to the (XML illegal) original characters.
   *
   * Used to save allocating a string every time, given that these PUA mapped chars are rare.
   */
  def needsPUAToXMLRemapping(s: String): Boolean = {
    var i = 0
    val len = s.length
    while (i < len) {
      val v = s.charAt(i).toInt
      if ((v == 0xD) || // not PUA, but string still needs remapping since CR must be mapped to LF
          (v >= 0xE000 && v <= 0xE01F) || // PUA chars that are Ascii C0 controls.
          (v >= 0xE800 && v <= 0xEFFF) || // Surrogate codepoints
          (v == 0xF0FE) || (v == 0xF0FF) || // FFFE and FFFF illegal chars
          (v > 0x10FFFF)) {
        return true
      }
      i += 1
    }
    false
  }

  def isLeadingSurrogate(c: Char) = {
    c >= 0xD800 && c <= 0xDBFF
  }

  def isTrailingSurrogate(c: Char) = {
    c >= 0xDC00 && c <= 0xDFFF
  }

  /**
   * Length where a surrogate pair counts as 1 character, not two.
   */
  def uncodeLength(s: String) = {
    // performance note: this might get called a lot. So needs to be fast.
    // it needs to scan the string once, examine each character.
    // using getBytes utf-32 isn't necessarily slow. It might be fine.
    val res = s.getBytes("UTF-32BE").length / 4
    res
  }
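  /*
   * Illustrative sketch of uncodeLength above (not part of the original source):
   * a character outside the BMP is two UTF-16 code units but counts as one.
   * {{{
   *   val s = "a\uD83D\uDE00b"  // 'a', U+1F600 as a surrogate pair, 'b'
   *   s.length                  // 4 (UTF-16 code units)
   *   XMLUtils.uncodeLength(s)  // 3 (the surrogate pair counts once)
   * }}}
   */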

  /**
   * Because of surrogate pairs, and the difference between 16-bit string codepoints
   * and real character codes, lots of things that traverse strings need
   * to consider either the codepoint after (if current is a leading surrogate)
   * or codepoint before (if current is a trailing surrogate).
   *
   * This calls a body function with prev, current, next bound to those.
   * For first codepoint prev will be 0. For last codepoint next will be 0.
   *
   * NOTE: This function contains the same algorithm as
   * remapXMLIllegalCharactersToPUA, but is more general and is a bit slower.
   * Any changes made to this function probably need to be incorporated into
   * the other.
   */
  def walkUnicodeString[T](str: String)(bodyFunc: (Char, Char, Char) => T): Seq[T] = {
    val len = str.length
    if (len == 0) return Nil

    val list = new scala.collection.mutable.ListBuffer[T]

    var pos = 0;
    var prev = 0.toChar
    var curr = str(0)
    var next = 0.toChar

    while (pos < len) {
      next = if (pos + 1 < len) str(pos + 1) else 0.toChar
      list += bodyFunc(prev, curr, next)
      prev = curr
      curr = next

      pos += 1
    }
    list
  }
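  /*
   * Illustrative sketch of walkUnicodeString above (not part of the original
   * source). For "abc" the body function is called three times with
   * (prev, curr, next) windows, padded with NUL at the ends:
   * {{{
   *   XMLUtils.walkUnicodeString("abc") { (prev, curr, next) => (prev, curr, next) }
   *   // Seq(('\u0000','a','b'), ('a','b','c'), ('b','c','\u0000'))
   * }}}
   */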

  /*
   * This function contains the same string traversal algorithm as
   * walkUnicodeString. The only difference is that it uses a StringBuilder
   * rather than a ListBuffer[T] that would be used in walkUnicodeString. Note
   * that since StringBuilder is not synchronized it is noticeably faster than
   * StringBuffer, and since the StringBuilder is local to the function, we
   * don't have to worry about any threading issues. This specificity makes for
   * a noticeable speed increase, so much so that the code duplication is worth
   * it. Any changes made to this function probably need to be incorporated
   * into the other.
   */
  def remapXMLCharacters(dfdlString: String, remapFunc: (Char) => Char): String = {
    // we want to remap XML-illegal characters
    // but leave legal surrogate-pair character pairs alone.
    def remapOneChar(previous: Char, current: Char, next: Char): Char = {
      if (isLeadingSurrogate(current) && isTrailingSurrogate(next)) return current
      if (isTrailingSurrogate(current) && isLeadingSurrogate(previous)) return current
      remapFunc(current)
    }

    val len = dfdlString.length
    if (len == 0) return dfdlString

    val sb = new StringBuilder()

    var pos = 0;
    var prev = 0.toChar
    var curr = dfdlString(0)
    var next = 0.toChar

    while (pos < len) {
      next = if (pos + 1 < len) dfdlString(pos + 1) else 0.toChar
      if (curr == 0xD) {
        if (next != 0xA) {
          // This is a lone CR (i.e. not a CRLF), so convert the CR to a LF
          sb.append(0xA.toChar)
        } else {
          // This is a CRLF. Skip the CR, essentially converting the CRLF to
          // just LF. Do nothing.
        }
      } else {
        sb.append(remapOneChar(prev, curr, next))
      }
      prev = curr
      curr = next

      pos += 1
    }

    sb.toString
  }

  def remapXMLIllegalCharactersToPUA(dfdlString: String): String = {
    if (needsXMLToPUARemapping(dfdlString)) {
      // This essentially doubles the work if remapping is needed (since we
      // scan the string once to see if it's needed, then scan again for
      // remapping). But the common case is that remapping is not needed, so we
      // only need to scan the string once AND we avoid allocating a new string
      // with characters remapped.
      remapXMLCharacters(dfdlString, remapXMLIllegalCharToPUA(false))
    } else {
      dfdlString
    }
  }

  /**
   * Converts PUA characters back into the original (XML Illegal) characters
   * they represent.
   */
  def remapPUAToXMLIllegalCharacters(dfdlString: String): String = {
    if (needsPUAToXMLRemapping(dfdlString)) {
      // This essentially doubles the work if remapping is needed (since we
      // scan the string once to see if it's needed, then scan again for
      // remapping). But the common case is that remapping is not needed, so we
      // only need to scan the string once AND we avoid allocating a new string
      // with characters remapped.
      remapXMLCharacters(dfdlString, remapPUAToXMLIllegalChar)
    } else {
      dfdlString
    }
  }
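  /*
   * Illustrative sketch of the PUA round trip (not part of the original source).
   * Note the CR-to-LF normalization is intentionally not reversed:
   * {{{
   *   val raw = "a\u0001b\rc"   // a C0 control and a lone CR
   *   val mapped = XMLUtils.remapXMLIllegalCharactersToPUA(raw)
   *   // "a\uE001b\nc" -- the control is now in the PUA; the lone CR became LF
   *   XMLUtils.remapPUAToXMLIllegalCharacters(mapped)
   *   // "a\u0001b\nc" -- PUA characters are mapped back; the LF stays an LF
   * }}}
   */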

  def coalesceAllAdjacentTextNodes(node: Node): Node = {
    node match {
      case Elem(prefix, label, attribs, scope, child @ _*) => {
        val coalescedChildren = child.map(coalesceAllAdjacentTextNodes(_))
        val newChildren = coalesceAdjacentTextNodes(coalescedChildren)
        Elem(prefix, label, attribs, scope, true, newChildren: _*)
      }
      case x => x
    }
  }

  /*
   * This is needed for equality comparison of XML.
   *
   * Ex: element content written as "foo&#x221;bar" (text, a character reference,
   * then more text) is parsed as 3 nodes, not one string node,
   * but appears to be one string when serialized as XML.
   *
   * Once the XML has been read into XML objects, the 3 objects
   * are just 3 adjacent text nodes, so adjacent text nodes
   * can be coalesced for use in the DFDL Infoset, or for comparing
   * trees of XML that may have been created different ways.
   */
  def coalesceAdjacentTextNodes(seq: Seq[Node]): Seq[Node] = {
    if (seq.length == 0) return seq
    if (seq.length == 1) {
      seq(0) match {
        case a: Atom[_] => return seq
        case _ => // fall through to code below. (We need to process children)
      }
    }
    val ab = ArrayBuilder.make[Node]
    var i = 0
    //
    // invariant: either the tn node is null
    // or the stringbuilder is null or empty
    //
    // They never both have content.
    //
    var tn: Node = null
    var sb: StringBuilder = null
    def processText() = {
      if (tn == null) {
        if (sb != null && sb.length > 0) {
          // we have accumulated text
          // let's output a text node
          // Note that a Text constructor
          // will escapify the text again.
          // We unescaped it
          // when we used .text to get data
          // out of the nodes.
          ab += new Text(sb.toString)
          sb.clear()
        }
      } else {
        // tn not null
        Assert.invariant(sb == null || sb.length == 0)
        ab += tn
        tn = null
      }
    }
    while (i < seq.length) {
      val current = seq(i)
      i = i + 1
      if ((current.isInstanceOf[Text] || current.isInstanceOf[Unparsed])) {
        if (tn == null) {
          if (sb == null || sb.length == 0) {
            // hold onto this text node. It might be isolated
            tn = current
          } else {
            // accumulate this text
            sb.append(current.text)
          }
        } else {
          if (sb == null) sb = new StringBuilder
          // accumulate both the pending tn text node
          // and this new one we just encountered.
          //
          // Note we use .text here - that unescapifies
          // Which is important since we're putting together
          // things that might be PCData (aka CDATA, i.e., text bracketed
          // by <![CDATA[ ... ]]>). We want that bracketing gone.
          //
          sb.append(tn.text)
          sb.append(current.text)
          //
          // set tn to null to indicate we're accumulating
          // into the string buffer
          //
          tn = null
        }
      } else {
        // not an atom
        processText // if there is pending text output that first
        ab += current // then the current non-atom node.
      }
    }
    // we fell out of the loop. So
    processText // in case there is text left pending when we hit the end
    ab.result
  }
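  /*
   * Illustrative sketch of coalesceAdjacentTextNodes above (not part of the
   * original source). Adjacent text nodes merge; element nodes act as separators:
   * {{{
   *   val nodes: Seq[Node] = Seq(Text("ab"), Text("cd"), <x/>, Text("ef"))
   *   XMLUtils.coalesceAdjacentTextNodes(nodes)
   *   // Seq(Text("abcd"), <x/>, Text("ef"))
   * }}}
   */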

  val XSD_NAMESPACE = NS("http://www.w3.org/2001/XMLSchema") // removed trailing slash (namespaces care)
  val XSI_NAMESPACE = NS("http://www.w3.org/2001/XMLSchema-instance")
  val XPATH_FUNCTION_NAMESPACE = NS("http://www.w3.org/2005/xpath-functions")
  val XPATH_MATH_NAMESPACE = NS("http://www.w3.org/2005/xpath-functions/math")
  val DFDL_NAMESPACE = NS("http://www.ogf.org/dfdl/dfdl-1.0/") // dfdl ns does have a trailing slash
  val DFDLX_NAMESPACE = NS("http://www.ogf.org/dfdl/dfdl-1.0/extensions")
  val TDML_NAMESPACE = NS("http://www.ibm.com/xmlns/dfdl/testData")
  val EXAMPLE_NAMESPACE = NS("http://example.com")
  val XHTML_NAMESPACE = NS("http://www.w3.org/1999/xhtml")

  /**
   * Returns an Elem with local name "element", and the scope provided
   * with the prefix of the Elem setup to match the scope's binding
   * for the XSD_NAMESPACE.
   *
   * If the XSD_NAMESPACE is the default namespace, then the prefix will
   * be null. If the XSD_NAMESPACE is bound to a prefix, the first such
   * prefix will be used for the returned Elem.
   */
  def getXSDElement(scope: NamespaceBinding): Elem = {
    val xsdPre = scope.getPrefix(XSD_NAMESPACE.toString)
    val isXSDTheDefaultNS = XSD_NAMESPACE.toString() == scope.getURI(null)
    val xsdPrefix =
      if (xsdPre ne null) xsdPre
      else if (isXSDTheDefaultNS) null
      else Assert.usageError("Scope argument must have a binding for the XSD namespace.")
    val res =
      Elem(xsdPrefix, "element", Null, scope, true)
    res
  }
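  /*
   * Illustrative sketch of getXSDElement above (not part of the original source):
   * {{{
   *   val scope = <x xmlns:xs="http://www.w3.org/2001/XMLSchema"/>.scope
   *   XMLUtils.getXSDElement(scope)
   *   // <xs:element xmlns:xs="http://www.w3.org/2001/XMLSchema"/> -- prefix "xs" is reused
   * }}}
   */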

  /**
   * Added to support extensions and proposed future features as part of daffodil.
   *
   * The DFDL standard requires us to keep these out of the primary DFDL namespace, and
   * we really should be using URN-style notation, not http URLs for these.
   * (for why http URLs are a bad idea for these, see:
   * http://www.w3.org/blog/systeam/2008/02/08/w3c_s_excessive_dtd_traffic/ )
   *
   * These definitions must match their XSD counterparts in dafint.xsd and dafext.xsd
   */
  private val DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_NCSA = "urn:ogf:dfdl:2013:imp:opensource.ncsa.illinois.edu:2012"
  private val DAFFODIL_EXTENSION_NAMESPACE_NCSA = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_NCSA + ":ext")
  val EXT_PREFIX_NCSA = "daf"
  val EXT_NS_NCSA = NS(DAFFODIL_EXTENSION_NAMESPACE_NCSA.uri)

  private val DAFFODIL_NAMESPACE_ROOT_APACHE = "urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018"
  private val DAFFODIL_EXTENSION_NAMESPACE_APACHE = NS(DAFFODIL_NAMESPACE_ROOT_APACHE + ":ext")
  val EXT_PREFIX_APACHE = "daf"
  val EXT_NS_APACHE = NS(DAFFODIL_EXTENSION_NAMESPACE_APACHE.uri)

  private val DAFFODIL_INTERNAL_NAMESPACE = NS(DAFFODIL_NAMESPACE_ROOT_APACHE + ":int")
  val INT_PREFIX = "dafint"
  val INT_NS = NS(DAFFODIL_INTERNAL_NAMESPACE.uri)

  val DAFFODIL_SAX_URN_ROOT: String = DAFFODIL_NAMESPACE_ROOT_APACHE + ":sax"
  val DAFFODIL_SAX_URN_PARSERESULT: String = DAFFODIL_SAX_URN_ROOT + ":ParseResult"
  val DAFFODIL_SAX_URN_BLOBDIRECTORY: String = DAFFODIL_SAX_URN_ROOT + ":BlobDirectory"
  val DAFFODIL_SAX_URN_BLOBPREFIX: String = DAFFODIL_SAX_URN_ROOT + ":BlobPrefix"
  val DAFFODIL_SAX_URN_BLOBSUFFIX: String = DAFFODIL_SAX_URN_ROOT + ":BlobSuffix"

  val SAX_NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces"
  val SAX_NAMESPACE_PREFIXES_FEATURE = "http://xml.org/sax/features/namespace-prefixes"

  /**
   * Always enable this feature (which disables doctypes).
   */
  val XML_DISALLOW_DOCTYPE_FEATURE = "http://apache.org/xml/features/disallow-doctype-decl"

  /**
   * Always disable this. Might not be necessary if doctypes are disallowed.
   */
  val XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE = "http://xml.org/sax/features/external-parameter-entities"

  /**
   * Always disable this. Might not be necessary if doctypes are disallowed.
   */
  val XML_EXTERNAL_GENERAL_ENTITIES_FEATURE = "http://xml.org/sax/features/external-general-entities"

  /**
   * Always disable this. Might not be necessary if doctypes are disallowed.
   */
  val XML_LOAD_EXTERNAL_DTD_FEATURE = "http://apache.org/xml/features/nonvalidating/load-external-dtd"

  /**
   * Sets properties that disable insecure XML reader behaviors.
   * @param xmlReader - the reader to change feature settings on.
   */
  def setSecureDefaults(xmlReader: XMLReader) : Unit = {
    xmlReader.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true)
    //
    // We don't actually know what FEATURE_SECURE_PROCESSING disables,
    // so we also set these individually to their secure settings.
    //
    xmlReader.setFeature(XMLUtils.XML_DISALLOW_DOCTYPE_FEATURE, true)
    xmlReader.setFeature(XMLUtils.XML_EXTERNAL_PARAMETER_ENTITIES_FEATURE, false)
    xmlReader.setFeature(XMLUtils.XML_EXTERNAL_GENERAL_ENTITIES_FEATURE, false)
  }
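  /*
   * Illustrative sketch of applying setSecureDefaults above (not part of the
   * original source). It assumes the JAXP-obtained reader recognizes the Xerces
   * features used, which the JDK's built-in Xerces-based parser does:
   * {{{
   *   import javax.xml.parsers.SAXParserFactory
   *   val factory = SAXParserFactory.newInstance()
   *   factory.setNamespaceAware(true)
   *   val reader = factory.newSAXParser().getXMLReader()
   *   XMLUtils.setSecureDefaults(reader) // doctypes and external entities now disallowed
   * }}}
   */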

  val FILE_ATTRIBUTE_NAME = "file"
  val LINE_ATTRIBUTE_NAME = "line"
  val COLUMN_ATTRIBUTE_NAME = "col"

  // Shorter forms, to make constructing XML literals less verbose (shorter lines).
  val xsdURI = XSD_NAMESPACE
  val dfdlURI = DFDL_NAMESPACE
  val dfdlxURI = DFDLX_NAMESPACE
  val dfdlAppinfoSource = NS("http://www.ogf.org/dfdl/")
  val targetNS = EXAMPLE_NAMESPACE // we use this for tests.
  val xsiURI = XSI_NAMESPACE
  val fnURI = XPATH_FUNCTION_NAMESPACE
  val mathURI = XPATH_MATH_NAMESPACE
  val dafintURI = DAFFODIL_INTERNAL_NAMESPACE

  val DFDL_SIMPLE_BUILT_IN_TYPES =
    List(
      "string",
      "float",
      "double",
      "decimal",
      "integer",
      "long",
      "int",
      "short",
      "byte",
      "unsignedLong",
      "unsignedInt",
      "nonNegativeInteger",
      "unsignedShort",
      "unsignedByte",
      "boolean",
      "date",
      "time",
      "dateTime",
      "hexBinary")

  def slashify(s: String): String = if (s == "" || s.endsWith("/")) s else s + "/"

  /**
   * Annoyingly, namespace bindings are not a collection you can process like a normal
   * collection; instead they are linked by parent chains. This flattens the chain into a Seq.
   */
  def namespaceBindings(nsBinding: NamespaceBinding): Seq[NamespaceBinding] = {
    if (nsBinding == null) Nil
    else {
      val thisOne =
        if (nsBinding.uri != null) List(nsBinding)
        else Nil
      val others = namespaceBindings(nsBinding.parent)
      thisOne ++ others
    }
  }
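  /*
   * Illustrative sketch of namespaceBindings above (not part of the original
   * source). TopScope carries a null URI, so it is excluded from the result:
   * {{{
   *   val scope = <a xmlns:p="urn:p" xmlns:q="urn:q"/>.scope
   *   XMLUtils.namespaceBindings(scope).map(_.prefix).toSet // Set("p", "q")
   * }}}
   */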

  /**
   * We don't want to be sensitive to which prefix people bind to the DFDL namespace.
   */
  def dfdlAttributes(n: Node) = {
    n.attributes filter {
      _.getNamespace(n) == DFDL_NAMESPACE.toString
    }
  }

  def dfdlxAttributes(n: Node) = {
    n.attributes.filter {
      _.getNamespace(n) == DFDLX_NAMESPACE.toString
    }
  }

  def dafAttributes(n: Node) = {
    n.attributes.filter { a =>
      a.getNamespace(n) == XMLUtils.EXT_NS_NCSA.toString ||
        a.getNamespace(n) == XMLUtils.EXT_NS_APACHE.toString
    }
  }

  /**
   * Used to collapse the excessive xmlns proliferation.
   *
   * If a local scope has bindings in it that are not in the outer scope
   * then a new local scope is created which extends the outer scope.
   *
   * This algorithm is n^2 (or worse) in the length of the outer binding chain (worst case).
   */
  def combineScopes(local: NamespaceBinding, outer: NamespaceBinding): NamespaceBinding = {
    if (local == TopScope) outer
    else {
      val NamespaceBinding(pre, uri, moreBindings) = local
      val outerURI = outer.getURI(pre)
      if (outerURI == uri) {
        // same binding for this prefix in the outer, so we don't need
        // this binding from the local scope.
        combineScopes(moreBindings, outer)
      } else if (outerURI == null) {
        // outer lacks a binding for this prefix
        NamespaceBinding(pre, uri, combineScopes(moreBindings, outer))
      } else {
        // outer has a different binding for this prefix.
        // one would hope that we can just put our superseding binding on the
        // front, but you end up with two bindings for the same prefix
        // in the chain ... and things fail
        //
        // The problem this creates is that it un-shares all the sub-structure
        // of the scopes, and so we no longer have contained elements
        // that share scopes with enclosing parents. That may mean that
        // lots of xmlns:pre="ns" proliferate again even though they're
        // unnecessary.
        //
        val outerWithoutDuplicate = removeBindings(NamespaceBinding(pre, uri, TopScope), outer)
        val moreBindingsWithoutConflict = removeBindings(NamespaceBinding(pre, uri, TopScope), moreBindings)
        NamespaceBinding(pre, uri, combineScopes(moreBindingsWithoutConflict, outerWithoutDuplicate))
      }
    }
  }
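  /*
   * Illustrative sketch of combineScopes above (not part of the original source).
   * A local binding already present in the outer scope is dropped; a new one is kept:
   * {{{
   *   val outer = NamespaceBinding("a", "urn:a", TopScope)
   *   val local = NamespaceBinding("b", "urn:b", NamespaceBinding("a", "urn:a", TopScope))
   *   XMLUtils.combineScopes(local, outer)
   *   // NamespaceBinding("b", "urn:b", outer) -- the duplicate "a" binding is not repeated
   * }}}
   */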

  /**
   * Removes from scope every binding whose prefix is also bound in nb (irrespective of URI).
   */
  def removeBindings(nb: NamespaceBinding, scope: NamespaceBinding): NamespaceBinding = {
    if (nb == TopScope) scope
    else if (scope == TopScope) scope
    else {
      val NamespaceBinding(pre, _, more) = scope
      if (nb.getURI(pre) != null) {
        // the scope has a binding for this prefix
        // so irrespective of the uri, we remove it.
        removeBindings(nb, more)
      } else {
        // no binding, so keep it
        scope.copy(parent = removeBindings(nb, more))
      }
    }
  }

  def combineScopes(prefix: String, ns: NS, outer: NamespaceBinding): NamespaceBinding = {
    if (ns.optURI.isEmpty) {
      outer
    } else {
      val uri = ns.optURI.get.toString
      val inner = NamespaceBinding(prefix, uri, TopScope)
      combineScopes(inner, outer)
    }
  }

  def collapseScopes(x: Node, outer: NamespaceBinding): Node = {
    x match {
      case Elem(pre, lab, md, scp, child @ _*) => {
        val newScope = combineScopes(scp, outer)
        Elem(pre, lab, md, newScope, true, (child flatMap { ch => collapseScopes(ch, newScope) }): _*)
      }
      case _ => x
    }
  }

  /**
   * Removes from a scope the NamespaceBindings whose URIs are in the specified namespaces.
   */
  def filterScope(nsb: NamespaceBinding, nss: Seq[NS]): NamespaceBinding = {
    val newHead =
      if (nsb == xml.TopScope) {
        xml.TopScope
      } else {
        val parentCopy = filterScope(nsb.parent, nss)
        if (nss.contains(NS(nsb.uri))) {
          parentCopy
        } else {
          nsb.copy(parent = parentCopy)
        }
      }
    newHead
  }

  /**
   * Determines if a prefix is defined inside a scope
   */
  def prefixInScope(prefix: String, scope: NamespaceBinding): Boolean = {
    val ret =
      if (scope == null) {
        false
      } else if (prefix == scope.prefix) {
        true
      } else {
        prefixInScope(prefix, scope.parent)
      }
    ret
  }

  /**
   * Remove Comments
   */

  def removeComments(e: Node): Node = {
    e match {
      case Elem(prefix, label, attribs, scope, child @ _*) => {
        val newChildren = child.filterNot { _.isInstanceOf[Comment] }.map { removeComments(_) }
        Elem(prefix, label, attribs, scope, true, newChildren: _*)
      }
      case x => x
    }
  }

  /**
   * Removes attributes that we want to ignore when comparing
   * infosets.
   *
   * Removes dafint namespace attributes such as dafint:line and dafint:col.
   *
   * If a sequence of namespaces is given, only the attributes and scopes in
   * those namespaces are removed. Otherwise, all attributes and scopes (aside
   * from special ones like xsi:nil) are removed. Additionally, if a scope is
   * filtered, the prefixes of elements prefixed with filtered scopes are also
   * removed.
   *
   * If a scope is given, it will be used for a child element if the
   * child's filtered scope is the same as that scope.
   *
   * Also strips out comments and mixed whitespace nodes, and coalesces
   * adjacent text nodes.
   *
   * Throws an exception if it contains mixed non-whitespace nodes.
   */
  def removeAttributes(n: Node, ns: Seq[NS] = Seq[NS](), parentScope: Option[NamespaceBinding] = None): Node = {
    val res1 = removeAttributes1(n, ns, parentScope).asInstanceOf[scala.xml.Node]
    val res2 = removeMixedWhitespace(res1)
    val res = res2(0) // .asInstanceOf[scala.xml.Node]
    res
  }

  /**
   * removes insignificant whitespace from between elements
   */

  private def removeMixedWhitespace(ns: Node): Node = {
    if (!ns.isInstanceOf[Elem]) return ns
    val e = ns.asInstanceOf[Elem]
    val children = e.child
    val noMixedChildren =
      if (children.exists(_.isInstanceOf[Elem])) {
        children.filter {
          case Text(data) if data.matches("""\s*""") => false
          case Text(data) => throw new Exception("Element %s contains mixed data: %s".format(e.label, data))
          case _ => true
        }.map(removeMixedWhitespace)
      } else {
        children.filter {
          //
          // So this is a bit strange, but we're dropping nodes that are Empty String.
          //
          // From the serialized XML we cannot tell an element that has a single Text("") child
          // from an empty element with Nil children.
          //
          case Text("") => false // drop empty strings
          case _ => true
        }
      }

    val res =
      if (noMixedChildren eq children) e
      else e.copy(child = noMixedChildren)
    res
  }

  /**
   * Used as part of preparing XML for comparison/diffing.
   *
   * Ensures that CDATA bracketing of data doesn't change
   * the value of text for comparison purposes.
   */
  private def convertPCDataToText(n: Node): Node = {
    val res = n match {
      case t: Text => t
      case a: Atom[_] => Text(a.text)
      case Elem(prefix, label, attributes, scope, children @ _*) => {
        val newChildren = children.map { convertPCDataToText(_) }
        Elem(prefix, label, attributes, scope, true, newChildren: _*)
      }
      case _ => n
    }
    res
  }

  private def removeAttributes1(n: Node, ns: Seq[NS], parentScope: Option[NamespaceBinding]): NodeSeq = {
    val res = n match {

      case e @ Elem(prefix, label, attributes, scope, children @ _*) => {

        val filteredScope = if (ns.length > 0) filterScope(scope, ns) else xml.TopScope

        // If the filtered scope is logically the same as the parent scope, use
        // the parent scope. Scala uses references to determine if scopes are
        // the same during pretty printing. However, scopes are immutable, so
        // the filter algorithm creates new scopes. Because of this, we need to
        // ignore the newly filtered scope if it is logically the same as the
        // parent so that the scala pretty printer doesn't see them as
        // different scopes.
        val newScope = parentScope match {
          case Some(ps) => if (ps == filteredScope) ps else filteredScope
          case None => filteredScope
        }

        val newChildren: NodeSeq = children.flatMap { removeAttributes1(_, ns, Some(newScope)) }

        // Important to merge adjacent text. Otherwise when comparing
        // two structures that print out the same, they might not be equal
        // because they have different length lists of text nodes
        //
        // Ex: A creates an element containing TWO
        // text nodes. But coming from the Daffodil Infoset, a string like
        // that would be just one text node.
        // Similarly abcghi has 3 child nodes.
        // The middle one is PCData. The two around it are Text.
        // Both Text and PCData are Atom[String].

        // Note: as of 2018-04-30, Mike Beckerle said: I am unable to reproduce the above.
        // The first example: calling .child on such an element returns an array buffer with 1 child in it
        // which is a Text node. The "abc"/CDATA/"ghi" element also has only
        // one Text node.  That said, this is from typing them at the scala shell.
        //
        // I suspect the above observations require that the scala.xml.parsing.ConstructingParser
        // is used. We do use it: the regular XML loader coalesces
        // text nodes well but doesn't preserve whitespace for CDATA regions well. That's why we use the
        // scala.xml.parsing.ConstructingParser, which doesn't coalesce text nodes
        // as well, and that's what motivates this explicit coalesce pass.
        //
        // See test test_scala_loader_cdata_bug - which characterizes the behavior
        // that is problematic for us in the standard loader, and why we have to use
        // the ConstructingParser.
        //
        val textMergedChildren = coalesceAdjacentTextNodes(newChildren)

        val newPrefix = if (prefixInScope(prefix, newScope)) prefix else null

        val newAttributes = attributes.filter { m =>
          m match {
            case xsiNilAttr @ PrefixedAttribute(_, "nil", Text("true"), _) if (NS(xsiNilAttr.getNamespace(e)) == XMLUtils.XSI_NAMESPACE) => {
              true
            }
            //
            // This tolerates xsi:nil='true' when xsi has no definition at all.
            case xsiNilAttr @ PrefixedAttribute("xsi", "nil", Text("true"), _) if (xsiNilAttr.getNamespace(e) == null) => {
              true
            }
            case dafIntAttr @ PrefixedAttribute(pre, _, _, _) if (pre ne null) && (dafIntAttr.getNamespace(e) == XMLUtils.DAFFODIL_INTERNAL_NAMESPACE.toString) => {
              Assert.invariant(pre != "")
              false // drop dafint attributes.
            }
            case xsiTypeAttr @ PrefixedAttribute(_, "type", _, _) if (NS(xsiTypeAttr.getNamespace(e)) == XMLUtils.XSI_NAMESPACE) => {
              // TODO: actually check xsi:type attributes are correct according
              // to the schema--requires schema-awareness in TDML Runner.
              // Do not hide xsi:type since it is used for hints for type aware
              // comparisons.
              true
            }
            case xsiTypeAttr @ PrefixedAttribute("xsi", "type", _, _) => {
              // TODO: actually check xsi:type attributes are correct according
              // to the schema--requires schema-awareness in TDML Runner.
              // Do not hide xsi:type since it is used for hints for type aware
              // comparisons.
              true
            }
            case attr =>
              true // keep all other attributes
          }
        }

        Elem(newPrefix, label, newAttributes, newScope, true, textMergedChildren: _*)
      }
      case c: scala.xml.Comment => NodeSeq.Empty // remove comments
      case other => other
    }
    res
  }

  /**
   * Prepares an XML node for diff comparison
   */
  private def prepareForDiffComparison(n: Node): Node = {
    val noComments = removeComments(n)
    val noPCData = convertPCDataToText(noComments)
    val combinedText = coalesceAllAdjacentTextNodes(noPCData)
    val noMixedWS = removeMixedWhitespace(combinedText)
    noMixedWS
  }

  class XMLDifferenceException(message: String) extends Exception(message)

  def compareAndReport(
    expected: Node,
    actual: Node,
    ignoreProcInstr: Boolean = true,
    checkPrefixes: Boolean = false,
    checkNamespaces: Boolean = false): Unit = {
    val expectedMinimized = prepareForDiffComparison(expected)
    val actualMinimized = prepareForDiffComparison(actual)
    val diffs = XMLUtils.computeDiff(
      expectedMinimized,
      actualMinimized,
      ignoreProcInstr,
      checkPrefixes,
      checkNamespaces)
    if (diffs.length > 0) {
      throw new XMLDifferenceException("""
Comparison failed.
Expected (attributes %s)
          %s
Actual (attributes %s for diff)
          %s
Differences were (path, expected, actual):
%s""".format(
        (if (checkPrefixes || checkNamespaces) "compared for diff"
        else "stripped"),
        (if (checkPrefixes || checkNamespaces) expected
        else removeAttributes(expected).toString),
        (if (checkPrefixes || checkNamespaces) "compared"
        else "ignored"),
        actual,
        diffs.map { _.toString }.mkString("- ", "\n- ", "\n")))
    }
  }
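  /*
   * Illustrative sketch of compareAndReport above (not part of the original
   * source). A mismatch surfaces as an XMLDifferenceException whose message
   * lists (path, expected, actual) triples:
   * {{{
   *   val expected = <r><a>1</a></r>
   *   val actual   = <r><a>2</a></r>
   *   try XMLUtils.compareAndReport(expected, actual)
   *   catch {
   *     case e: XMLUtils.XMLDifferenceException =>
   *       println(e.getMessage) // includes a diff triple like (r/a.charAt(1), 1, 2)
   *   }
   * }}}
   */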

  /**
   * computes a precise difference list which is a sequence of triples.
   * Each triple is the path (an x-path-like string), followed by expected, and actual values.
   */
  def computeDiff(
    a: Node,
    b: Node,
    ignoreProcInstr: Boolean = true,
    checkPrefixes: Boolean = false,
    checkNamespaces: Boolean = false) = {
    computeDiffOne(
      a,
      b,
      TopScope,
      TopScope,
      None,
      Nil,
      ignoreProcInstr,
      checkPrefixes,
      checkNamespaces,
      None)
  }

  def childArrayCounters(e: Elem) = {
    val Elem(_, _, _, _, children @ _*) = e
    val labels = children.map { _.label }
    val groups = labels.groupBy { x => x }
    val counts = groups.map { case (label, labelList) => (label, labelList.length) }
    val arrayCounts = counts.filter { case (label, 1) => false; case _ => true } // remove counters for scalars
    val arrayCounters = arrayCounts.map { case (label, _) => (label, 1.toLong) } // 1 based like XPath!
    arrayCounters
  }

  def computeDiffOne(
    an: Node,
    bn: Node,
    aParentScope: NamespaceBinding,
    bParentScope: NamespaceBinding,
    maybeIndex: Option[Int],
    parentPathSteps: Seq[String],
    ignoreProcInstr: Boolean,
    checkPrefixes: Boolean,
    checkNamespaces: Boolean,
    maybeType: Option[String]): Seq[(String, String, String)] = {
    lazy val zPath = parentPathSteps.reverse.mkString("/")
    (an, bn) match {
      case (a: Elem, b: Elem) => {
        val Elem(prefixA, labelA, attribsA, nsbA, childrenA @ _*) = a
        val Elem(prefixB, labelB, attribsB, nsbB, childrenB @ _*) = b
        val typeA: Option[String] = a.attribute(XSI_NAMESPACE.toString, "type").map(_.head.text)
        val typeB: Option[String] = b.attribute(XSI_NAMESPACE.toString, "type").map(_.head.text)
        val maybeType: Option[String] = Option(typeA.getOrElse(typeB.getOrElse(null)))
        val nilledA = a.attribute(XSI_NAMESPACE.toString, "nil")
        val nilledB = b.attribute(XSI_NAMESPACE.toString, "nil")
        val mappingsA = if (checkNamespaces) nsbA.buildString(aParentScope).trim else ""
        val mappingsB = if (checkNamespaces) nsbB.buildString(bParentScope).trim else ""

        if (labelA != labelB) {
          // different label
          List((zPath, labelA, labelB))
        } else if (checkPrefixes && prefixA != prefixB) {
          // different prefix
          List((zPath + "/" + labelA + "@prefix", prefixA, prefixB))
        } else if (checkNamespaces && mappingsA != mappingsB) {
          // different namespace bindings
          List((zPath + "/" + labelA + "@xmlns", mappingsA, mappingsB))
        } else if (nilledA != nilledB) {
          // different xsi:nil
          List((zPath + "/" + labelA + "@xsi:nil",
            nilledA.map(_.toString).getOrElse(""),
            nilledB.map(_.toString).getOrElse("")))
        } else if (typeA != typeB && typeA.isDefined && typeB.isDefined) {
          // different xsi:type (if both supplied)
          List((zPath + "/" + labelA + "@xsi:type",
            typeA.map(_.toString).getOrElse(""),
            typeB.map(_.toString).getOrElse("")))
        } else {
          val pathLabel = labelA + maybeIndex.map("[" + _ + "]").getOrElse("")
          val thisPathStep = pathLabel +: parentPathSteps

          val (childrenACompare, childrenBCompare) =
            if (ignoreProcInstr) {
              val ca = childrenA.filterNot(_.isInstanceOf[ProcInstr])
              val cb = childrenB.filterNot(_.isInstanceOf[ProcInstr])
              (ca, cb)
            } else {
              (childrenA, childrenB)
            }

          // for elements with repeats we want to use an index in any diff
          // output. So for repeating children, we'll create a mutable map where
          // the key is the label and the value is the count of how many
          // children of that label we've seen
          val repeatingChildrenLabels = childrenA.groupBy(_.label).filter { case (k,v) => v.length > 1 }.keys
          val labelsWithZeroCount = repeatingChildrenLabels.map { _ -> 0 }
          val countMap = mutable.Map(labelsWithZeroCount.toSeq: _*)

          val childrenDiffs = childrenACompare.zip(childrenBCompare).flatMap { case (ca, cb) =>
            val maybeChildCount = countMap.get(ca.label)
            val maybeChildIndex = maybeChildCount.map { count =>
              countMap(ca.label) += 1
              count + 1
            }
            computeDiffOne(
              ca,
              cb,
              nsbA,
              nsbB,
              maybeChildIndex,
              thisPathStep,
              ignoreProcInstr,
              checkPrefixes,
              checkNamespaces,
              maybeType)
          }

          // if childrenA and childrenB have different lengths, zip will drop the
          // extras. This reports a diff if the lengths differ.
          val childrenLengthDiff =
            if (childrenA.length != childrenB.length) {
              List((zPath + "/" + labelA + "::child@count)",
                childrenA.length.toString,
                childrenB.length.toString))
            } else {
              Nil
            }

          childrenDiffs ++ childrenLengthDiff
        }
      }
      case (tA: Text, tB: Text) => {
        val thisDiff = computeTextDiff(zPath, tA, tB, maybeType)
        thisDiff
      }
      case (pA: ProcInstr, pB: ProcInstr) => {
        val ProcInstr(tA1label, tA1content) = pA
        val ProcInstr(tB1label, tB1content) = pB
        val labelDiff = computeTextDiff(zPath, tA1label, tB1label, None)
        //
        // The content of a ProcInstr is technically just a big string.
        // But in our usage the content is XML-like, so it could be loaded and then compared
        // as XML, if the label is in fact an indicator that this is our special
        // PI with format info.
        //
        // Much of that XML-ish content is attributes however, so we need to be sure
        // we're comparing those too.
        //
        // TODO: implement XML-comparison for our data format info PIs.
        //
        val contentDiff = computeTextDiff(zPath, tA1content, tB1content, maybeType)
        labelDiff ++ contentDiff
      }
      case _ => {
        List((zPath, an.toString, bn.toString))
      }
    }
  }

  def computeTextDiff(
    zPath: String,
    tA: Text,
    tB: Text,
    maybeType: Option[String]): Seq[(String, String, String)] = {

    val dataA = tA.toString
    val dataB = tB.toString
    computeTextDiff(zPath, dataA, dataB, maybeType)
  }

  def computeBlobDiff(zPath: String, dataA: String, dataB: String) = {
    val uriA = Misc.searchResourceOption(dataA, None)
    val uriB = Misc.searchResourceOption(dataB, None)

    val pathA = uriA.map { u => Paths.get(u) }
    val pathB = uriB.map { u => Paths.get(u) }

    val canReadA = pathA.map { p => Files.isReadable(p) }.getOrElse(false)
    val canReadB = pathB.map { p => Files.isReadable(p) }.getOrElse(false)

    if (!canReadA || !canReadB) {
      val path = zPath + ".canRead"
      Seq((path, canReadA.toString, canReadB.toString))
    } else {
      val CHUNK_SIZE = 1024
      val arrayA = new Array[Byte](CHUNK_SIZE)
      val arrayB = new Array[Byte](CHUNK_SIZE)

      val streamA = Files.newInputStream(pathA.get, StandardOpenOption.READ)
      val streamB = Files.newInputStream(pathB.get, StandardOpenOption.READ)

      var lenA: Int = 0
      var lenB: Int = 0
      var numSameBytes: Int = 0
      var areSame: Boolean = true

      while ({
          lenA = streamA.read(arrayA)
          lenB = streamB.read(arrayB)
          areSame = lenA == lenB && arrayA.sameElements(arrayB)
          areSame && lenA != -1 && lenB != -1
      }) {
        numSameBytes += lenA
      }

      if (!areSame) {
        val zip = arrayA.zip(arrayB)
        val firstDiffIndex = zip.indexWhere(z => z._1 != z._2)

        val MAX_CONTEXT = 40
        val contextA = arrayA.take(lenA).drop(firstDiffIndex).take(MAX_CONTEXT)
        val contextB = arrayB.take(lenB).drop(firstDiffIndex).take(MAX_CONTEXT)
        val hexA = Misc.bytes2Hex(contextA)
        val hexB = Misc.bytes2Hex(contextB)

        val absoluteIndex = numSameBytes + firstDiffIndex
        val path = zPath + ".bytesAt(" + (absoluteIndex + 1) + ")"
        Seq((path, hexA, hexB))
      } else {
        Nil
      }
    }
  }

  def computeTextDiff(
    zPath: String,
    dataA: String,
    dataB: String,
    maybeType: Option[String]): Seq[(String, String, String)] = {

    val hasBlobType = maybeType.isDefined && maybeType.get == "xs:anyURI"
    val dataLooksLikeBlobURI = Seq(dataA, dataB).forall(_.startsWith("file://"))
    if (hasBlobType || dataLooksLikeBlobURI) computeBlobDiff(zPath, dataA, dataB)
    else if (textIsSame(dataA, dataB, maybeType)) Nil
    else {
      // There must be some difference, so let's find just the first index of
      // difference and we'll include that and some following characters for
      // context.
      val CHARS_TO_SHOW_AFTER_DIFF = 40

      val lenA = dataA.length
      val lenB = dataB.length
      var index = 0
      while (index < lenA && index < lenB && dataA(index) == dataB(index)) {
        index += 1
      }

      // We know there must be a diff once we got here. Either dataA/dataB is a
      // prefix of the other and index is where the prefix ends, or index is
      // the first difference found. Either way, we can safely use slice() to
      // get at most some number of characters at that index for context.
      val contextA = Misc.remapControlsAndLineEndingsToVisibleGlyphs(dataA.slice(index, index + CHARS_TO_SHOW_AFTER_DIFF))
      val contextB = Misc.remapControlsAndLineEndingsToVisibleGlyphs(dataB.slice(index, index + CHARS_TO_SHOW_AFTER_DIFF))
      val path = zPath + ".charAt(" + (index + 1) + ")"
      Seq((path, contextA, contextB))
    }
  }

  def textIsSame(dataA: String, dataB: String, maybeType: Option[String]): Boolean = {
    maybeType match {
      case Some("xs:hexBinary") => dataA.equalsIgnoreCase(dataB)
      case Some("xs:date") => {
        val a = DFDLDateConversion.fromXMLString(dataA)
        val b = DFDLDateConversion.fromXMLString(dataB)
        a == b
      }
      case Some("xs:time") => {
        val a = DFDLTimeConversion.fromXMLString(dataA)
        val b = DFDLTimeConversion.fromXMLString(dataB)
        a == b
      }
      case Some("xs:dateTime") => {
        val a = DFDLDateTimeConversion.fromXMLString(dataA)
        val b = DFDLDateTimeConversion.fromXMLString(dataB)
        a == b
      }
      case _ => dataA == dataB
    }
  }
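  /*
   * Illustrative sketch of textIsSame above (not part of the original source).
   * The optional xsi:type hint switches on type-aware comparison:
   * {{{
   *   XMLUtils.textIsSame("A1B2", "a1b2", Some("xs:hexBinary")) // true: case-insensitive hex
   *   XMLUtils.textIsSame("A1B2", "a1b2", None)                 // false: plain string compare
   * }}}
   */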

  /**
   * For quick tests, we use literal Scala XML nodes. However, the underlying
   * infrastructure wants to be file-centric for diagnostic-message
   * reasons (line numbers for errors).
   */
  def convertNodeToTempFile(xml: Node, tmpDir: File, nameHint: String = "daffodil_tmp_") = {
    val prefix = prefixFromHint(nameHint)
    val tmpSchemaFile = File.createTempFile(prefix, ".dfdl.xsd", tmpDir)
    // Delete temp file when program exits
    tmpSchemaFile.deleteOnExit
    //
    // Note: we use our own pretty printer here because
    // Scala library one doesn't preserve/print CDATA properly.
    //
    val pp = new org.apache.daffodil.xml.PrettyPrinter(2)
    val xmlString = pp.format(xml)
    val fos = new java.io.FileOutputStream(tmpSchemaFile)
    val fw = new java.io.OutputStreamWriter(fos, StandardCharsets.UTF_8)
    fw.write(xmlString)
    fw.close()
    tmpSchemaFile
  }

  /**
   * Create a suitable prefix for a temp file name.
   *
   * @param nameHint a string incorporated into the prefix
   * @return the prefix string which has minimum length 3.
   */
  private def prefixFromHint(nameHint: String) = {

    nameHint.length match {
      case 0 => "daffodil_tmp_"
      case 1 => nameHint + "__"
      case 2 => nameHint + "_"
      case _ => nameHint
    }
  }

  def convertInputStreamToTempFile(
    is: java.io.InputStream,
    tmpDir: File,
    nameHint: String,
    suffix: String) = {
    val prefix = prefixFromHint(nameHint)
    val tmpSchemaFile = File.createTempFile(prefix, suffix, tmpDir)
    // Delete temp file when program exits
    tmpSchemaFile.deleteOnExit

    val fos = new java.io.FileOutputStream(tmpSchemaFile)
    IOUtils.copy(is, fos)
    fos.close()
    tmpSchemaFile
  }

  /**
   * Strong escaping that never loses information, handles apos and CR right.
   *
   * Escapes apostrophe (single quote) as well as the other XML escaped chars.
   * Remaps CR and any other XML-illegals into PUA. Replaces whitespace with
   * numeric character entities for additional safety.
   *
   * This is needed since XML may be using single quotes to surround a string which
   * might contain single quotes.
   *
   * The reason basic scala.xml.Utility.escape doesn't escape single-quotes is
   * HTML compatibility. Early versions of HTML don't define an "&apos;" entity.
   *
   * Furthermore, since some potentially illegal XML characters may be used here, we
   * are going to remap all the illegal XML characters to their corresponding PUA characters.
   *
   * Lastly, all whitespace chars are replaced by numeric character entities, and
   * anything above 0xFF that is not considered letter or digit, is also replaced
   * by a numeric character entity.
   *
   * The result is a string which can be displayed as an XML attribute value, and is
   * invertible back to the original string.
   *
   * Finally, CRLF will come through as "&#xE00D;&#xA;", and isolated CR
   * will come through as "&#xE00D;". That's because
   * if we used "&#xD;" for the CR, it might be converted to a LF by XML readers.
   * Not all XML readers/loaders do this, but it is described as standard behavior
   * in the XML specification.
   * We have to use our own PUA remapping trick if we want to be sure to preserve
   * CR in XML.
   */
  def escape(str: String, sb: StringBuilder = new StringBuilder()): StringBuilder = {
    var i = 0
    while (i < str.length) {
      val x = str(i)
      val c = escapeMapper(x)
      i += 1
      c match {
        case '\'' => sb.append("&#x27;") // don't use "&apos;" because it's not universally accepted (HTML doesn't have it in early versions)
        case '"' => sb.append("&quot;")
        case '&' => sb.append("&amp;")
        case '<' => sb.append("&lt;")
        case '>' => sb.append("&gt;")
        case _ if (c.isLetterOrDigit) => sb.append(c)
        case _ if (c.isWhitespace || c.isControl) => toNumericCharacterEntity(c, sb)
        // A0 is the NBSP character - not considered whitespace, but no glyph, so we need it numeric
        case _ if (c.toInt == 0xA0) => toNumericCharacterEntity(c, sb)
        // Any other char < 256 is punctuation or other glyph char
        case _ if (c.toInt < 0xFF) => sb.append(c)
        case _ => toNumericCharacterEntity(c, sb)
      }
    }
    sb
  }
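  /*
   * Illustrative sketch of escape above (not part of the original source),
   * showing the CR preservation described in the comment:
   * {{{
   *   XMLUtils.escape("a<b\r\nc").toString
   *   // "a&lt;b&#xE00D;&#xA;c" -- '<' uses a named entity; CR is PUA-remapped and then
   *   //                           numerically escaped; LF and other whitespace become
   *   //                           numeric character entities
   * }}}
   */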

  private val escapeMapper =
    remapXMLIllegalCharToPUA(
      checkForExistingPUA = false,
      replaceCRWithLF = false) _

  def toNumericCharacterEntity(c: Char, sb: StringBuilder) = {
    val i = c.toInt
    Assert.usage(i > 0) // NUL cannot be represented at all in XML.
    val s = Integer.toHexString(i).toUpperCase()
    sb.append("&#x")
    sb.append(s)
    sb.append(";")
  }

  private val xmlEntityPattern = new Regex("""&(quot|amp|apos|lt|gt);""", "entity")

  /**
   * Remove named XML escapes like &amp;, &gt;, and &lt; from a string.
   */
  def unescape(raw: String) = {
    val withoutNamedXMLCharEntities: String = {
      val res = xmlEntityPattern.replaceAllIn(raw, m => {
        val sb = scala.xml.Utility.unescape(m.group("entity"), new StringBuilder())
        // There really is no possibility for null to come back as we've made
        // sure to only include valid xml entities in the xmlEntityPattern.
        Assert.invariant(sb ne null)
        sb.toString()
      })
      res
    }
    withoutNamedXMLCharEntities
  }
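  /*
   * Illustrative sketch of unescape above (not part of the original source).
   * Only the five named XML entities are resolved; numeric character
   * references are left alone:
   * {{{
   *   XMLUtils.unescape("a &amp; b &lt; c") // "a & b < c"
   *   XMLUtils.unescape("&#x41;")           // "&#x41;" (unchanged)
   * }}}
   */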

  /**
   * Return a Maybe(URI) from NamespaceBinding based on some input prefix. There is a
   * NamespaceBinding equivalent of this called getURI, but that does not handle the null case
   * and will throw a NullPointerException when uri can't be found
   *
   * @param nsb NamespaceBinding we wish to search for the prefix's uri
   * @param prefix Prefix whose URI we search through the NamespaceBinding for
   * @return the uri string wrapped in a Maybe.One, or Maybe.Nope, if not found
   */
  @tailrec
  def maybeURI(nsb: NamespaceBinding, prefix: String): Maybe[String] = {
    if (nsb == null) Maybe.Nope
    else if (nsb.prefix == prefix) Maybe.One(nsb.uri)
    else maybeURI(nsb.parent, prefix)
  }
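  /*
   * Illustrative sketch of maybeURI above (not part of the original source):
   * {{{
   *   val nsb = NamespaceBinding("p", "urn:p", TopScope)
   *   XMLUtils.maybeURI(nsb, "p") // Maybe.One("urn:p")
   *   XMLUtils.maybeURI(nsb, "q") // Maybe.Nope
   * }}}
   */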

  /**
   * Return Maybe(prefix) from NamespaceBinding based on some input uri. There is a
   * NamespaceBinding equivalent of this called getPrefix, but that does not handle the null case
   * and will throw a NullPointerException when uri can't be found
   *
   * @param nsb NamespaceBinding we wish to search for the uri's prefix
   * @param uri URI whose prefix we search through the NamespaceBinding for
   * @return the prefix string wrapped in a Maybe.One, or Maybe.Nope if not found or prefix is null
   */
  @tailrec
  def maybePrefix(nsb: NamespaceBinding, uri: String): Maybe[String] = {

    if (nsb == null) Maybe.Nope
    else if (nsb.uri == uri) {
      if (nsb.prefix == null) {
        // the case of xmlns="some-uri"
        Maybe.Nope
      } else {
        Maybe.One(nsb.prefix)
      }
    }
    else maybePrefix(nsb.parent, uri)
  }
}

trait GetAttributesMixin extends ThrowsSDE {
  def xml: Node

  /**
   * Use to retrieve things that are not format properties.
   */
  def getAttributeRequired(name: String) = {
    getAttributeOption(name) match {
      case None => schemaDefinitionError("The attribute '" + name + "' is required.")
      case Some(s) => s
    }
  }

  /**
   * Use to retrieve things that are not format properties.
   */
  def getAttributeOption(name: String): Option[String] = {
    val attrString = xml.attribute(name).map { _.text }
    attrString
  }

  def getAttributeOption(ns: NS, name: String): Option[String] = {
    //
    // Most annoying, but this doesn't work....
    // val res = xml.attribute(ns.toString, name).map{ _.text }
    val attr = (xml \ ("@{" + ns.toString + "}" + name))
    if (attr.length == 0) None
    else Some(attr.text)
  }

  /**
   * For picking off the short-form annotations.
   */
  def dfdlAttributes(n: Node) = XMLUtils.dfdlAttributes(n)
  def dfdlxAttributes(n: Node) = XMLUtils.dfdlxAttributes(n)
  def dafAttributes(n: Node) = XMLUtils.dafAttributes(n)

}

class QNamePrefixNotInScopeException(pre: String, loc: LookupLocation)
  extends Exception("Prefix " + pre + " not found in scope. Location: " + loc.toString)

// Commented out for now, but we may reactivate this to
// do more validation stuff in the TDMLRunner. So we're keeping it in the
// source like this.
//
//object XMLSchemaUtils {
//  /**
//   * validate a DFDL schema.
//   *
//   * This validates the XML Schema language subset that DFDL uses, and also all the annotations
//   * hung off of it.
//   */
//  def validateDFDLSchema(doc: Node) = {
//    // TODO: should this do something other than throw an exception on a validation error?
//    //
//    // Users will write DFDL Schemas, using the xs or xsd prefix (usually) bound to the XML Schema namespace,
//    // and the dfdl prefix (usually) bound to the DFDL namespace.
//    //
//    // However, we don't want to validate using the XML Schema for XML Schema (which would be the usual interpretation
//    // of validating an XML Schema), instead we want to use the schema for the DFDL Subset of XML Schema.
//    //
//    // So, the hack here, is we're going to textually substitute the URIs, so that the validator doesn't have to be
//    // modified to do this switch, and we don't have to lie in the DFDL Subset schema, and claim it is realizing the
//    // XML Schema URI.
//    //
//    // However, we should consider whether there is a better way to do this involving either (a) lying and having the
//    // DFDL Subset Schema pretend it is the XSD schema, or we can play some catalog tricks perhaps.
//    //
//    // Also, the way this whole thing finds the necessary schemas is a bit daft. It should look in the jar or files,
//    // but it should be using an XML Catalog.
//    //
//    val docstring = doc.toString()
//    val xmlnsURI = "http://www.w3.org/2001/XMLSchema";
//    val xsdSubsetURI = "http://www.w3.org/2001/XMLSchema";
//    val docReplaced = docstring.replaceAll(xmlnsURI, xsdSubsetURI)
//    val docReader = new StringReader(docReplaced)
//    val schemaResource = Misc.getRequiredResource(Validator.dfdlSchemaFileName()).toURI()
//    val res =
//      try {
//        Validator.validateXMLStream(schemaResource, docReader)
//      } catch {
//        case e: ...  => {
//          val exc = e
//          // System.err.println(exc.getMessage())
//          // Really useful place for a breakpoint.
//          throw e
//        }
//      }
//    res
//  }
//}



