/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.daffodil.xml

import java.io.BufferedInputStream
import java.net.URI

import scala.io.Source
import scala.xml._
import scala.xml.include.sax.EncodingHeuristics
import scala.xml.parsing.ConstructingParser

import org.apache.daffodil.exceptions.Assert

/**
 * Scala 2.11 deprecated the Position object so it is no longer public.
 * However, we still need a way to decode the integer positions that contain
 * both line/col information. Scala provided no other way to decode this, so
 * this copies the relevant bits from:
 *
 * https://github.com/scala/scala/blob/2.11.x/src/library/scala/io/Position.scala
 *
 * Note that if Scala ever changes these values, line/column numbers will be off.
 */
object Position {
  /** Number of bits used to encode the line number */
  final val LINE_BITS = 20
  /** Number of bits used to encode the column number */
  final val COLUMN_BITS = 31 - LINE_BITS // no negatives => 31
  /** Mask to decode the line number */
  final val LINE_MASK = (1 << LINE_BITS) - 1
  /** Mask to decode the column number */
  final val COLUMN_MASK = (1 << COLUMN_BITS) - 1

  final def line(pos: Int): Int = (pos >> COLUMN_BITS) & LINE_MASK

  final def column(pos: Int): Int = pos & COLUMN_MASK
}
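
// Decoding sketch (illustrative values only, not used by the loader): the
// encoding packs the line number into the high bits and the column into the
// low bits, so:
//   val pos = (17 << Position.COLUMN_BITS) | 42
//   Position.line(pos)   // => 17
//   Position.column(pos) // => 42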

/**
 * Loads XML using the Scala ConstructingParser for XML.
 *
 * Necessary as this XML loading technique handles <![CDATA[...]]> regions
 * properly, creating PCData nodes for their contents, and not otherwise
 * messing with those contents.
 *
 * This code is effectively our fork of the Scala ConstructingParser. This
 * works around some bugs in it.
 *
 * Xerces, unfortunately, messes with the contents of these CDATA regions,
 * normalizes whitespace inside them, and generally makes it impossible to do
 * things in XML that depend on the line structure of element content being
 * preserved.
 *
 * We have places where line structure matters. Specifically regular expressions
 * have a free-form syntax with comments that extend to end-of-line. If we always
 * wrap these with CDATA, and use this loader, not Xerces, then these will be
 * preserved properly.
 *
 * Also enhanced so that, when addPositionAttributes is true, it captures
 * file/line/column info for every element and adds it as attributes onto
 * the corresponding XML element.
 *
 * The way the constructing loader (aka ConstructingParser for XML) gets
 * positions is different. It is given just an offset into the document
 * file/stream, and it therefore must synthesize line/column info itself.
 *
 * This primary constructor is package private as the normalizeCRLFtoLF feature
 * is only for test/exploratory usage, or in case a future need arises to
 * preserve the non-normalizing behavior.
 *
 * @param uri URI for the XML to be loaded.
 * @param errorHandler Called back on load errors.
 * @param addPositionAttributes Use true if you want dafint:file,
 *                              dafint:col, and dafint:line attributes.
 *                              Defaults to false.
 * @param normalizeCRLFtoLF Use true to emulate the scala XML load
 *                          behavior of normalizing CRLF to LF, and solitary CR to LF.
 *                          Defaults to true. Should only be changed in special circumstances
 *                          as not normalizing CRLFs is non-standard for XML.
 *
 */
class DaffodilConstructingLoader private[xml] (uri: URI,
  errorHandler: org.xml.sax.ErrorHandler,
  addPositionAttributes: Boolean,
  normalizeCRLFtoLF: Boolean)
  extends ConstructingParser({
    // Note: we must open the XML carefully since it might be in some non
    // default encoding (we have tests that have UTF-16 for example)

    // must be buffered to support mark(), needed by heuristics
    val is = new BufferedInputStream(uri.toURL.openStream())
    val enc = EncodingHeuristics.readEncodingFromStream(is)
    Source.fromInputStream(is, enc)
  }, true) {

  /**
   * Public constructor insists on the normalizeCRLFtoLF behavior.
   */
  def this (uri: URI,
    errorHandler: org.xml.sax.ErrorHandler,
    addPositionAttributes: Boolean = false) =
    this(uri, errorHandler, addPositionAttributes, normalizeCRLFtoLF = true)

  /**
   * Ensures that DOCTYPEs (aka DTDs), if encountered, are rejected.
   *
   * Coverage is off because this should never be hit: the data is always
   * loaded with Xerces (for validation purposes) before this loader runs,
   * and Xerces will have caught any DOCTYPE in the XML.
   *
   * However, under code maintenance, suppose someone turned that off
   * or made that pass optional (for performance reasons perhaps). Then this
   * provides a last-gasp attempt to protect from DOCTYPE-related
   * insecurity.
   */
  // $COVERAGE-OFF$
  override def parseDTD(): Unit = {
    val e = makeSAXParseException(pos, "DOCTYPE is disallowed.")
    throw e
  }
  // $COVERAGE-ON$
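
  // For example (illustrative): if an input beginning with
  //   <!DOCTYPE note SYSTEM "note.dtd">
  // ever reached this method, it would be rejected with the exception above.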

  // This one line is a bit of a hack to get consistent line numbers. The
  // scala-xml library reads XML from a scala.io.Source which maintains private
  // line/col information about where in the Source we are reading from (i.e.
  // scala.io.Source.pos). The problem is that when CDATA or a processing
  // instruction is encountered, the library switches to a custom
  // "WithLookAhead" scala.io.Source that buffers the original Source. This
  // lookahead Source allows it to peek ahead a few characters, which is used
  // to find the end of CDATA and processing instructions. The problem is that
  // when it switches to this new Source, we lose position information since
  // that information is private to each Source. This causes line information
  // to reset to zero when the first CDATA or processing instruction is found.
  // And there is no good way to copy position information from one source to
  // another. So, what we can do is call this lookahead() function before any
  // XML is parsed. This causes the ConstructingLoader to immediately switch to
  // the buffering source. There may be some slight overhead for buffering, but
  // at least our line numbers are correct.
  lookahead()


  private def makeSAXParseException(pos: Int, msg: String) = {
    val line = Position.line(pos)
    val col = Position.column(pos)
    val exc = new org.xml.sax.SAXParseException(msg, null, uri.toString, line, col)
    exc
  }

  override def reportSyntaxError(pos: Int, msg: String): Unit = {
    val exc = makeSAXParseException(pos, msg)
    errorHandler.error(exc)
  }

  /*
   * Callback method invoked by MarkupParser after parsing an element, between
   * the elemStart and elemEnd callbacks. This adds daffodil file/line/column
   * information as attributes to the existing input attrs, modifying the scope
   * if necessary, then creates an element using the super def elem function.
   *
   *  @param pos      the position in the source file
   *  @param pre      the prefix
   *  @param local    the local name
   *  @param attrs    the attributes (metadata)
   *  @param scope    the namespace binding scope
   *  @param empty    `true` if the element was previously empty; `false` otherwise.
   *  @param nodes    the children of this element
   */
  override def elem(
    pos: Int,
    pre: String,
    local: String,
    attrs: MetaData,
    scope: NamespaceBinding,
    empty: Boolean,
    nodes: NodeSeq): NodeSeq = {

    val nsURI = NS(scope.getURI(pre))
    val isFileRootNode = (local.equalsIgnoreCase("schema") && nsURI == XMLUtils.XSD_NAMESPACE) ||
      (local.equalsIgnoreCase("testSuite") && nsURI == XMLUtils.TDML_NAMESPACE)
    val alreadyHasLineCol = attrs.exists {
      case PrefixedAttribute(XMLUtils.INT_PREFIX, attr, _, _) => {
        attr.equalsIgnoreCase(XMLUtils.COLUMN_ATTRIBUTE_NAME) ||
          attr.equalsIgnoreCase(XMLUtils.LINE_ATTRIBUTE_NAME)
      }
      case _ => false
    }

    val newAttrs: MetaData = {
      if (addPositionAttributes && !alreadyHasLineCol) {
        val withFile: MetaData =
          if (isFileRootNode) {
            new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.FILE_ATTRIBUTE_NAME, uri.toString, attrs)
          } else {
            attrs
          }
        val withCol: MetaData = new PrefixedAttribute(
          XMLUtils.INT_PREFIX, XMLUtils.COLUMN_ATTRIBUTE_NAME, Position.column(pos).toString, withFile)
        val withLine: MetaData = new PrefixedAttribute(
          XMLUtils.INT_PREFIX, XMLUtils.LINE_ATTRIBUTE_NAME, Position.line(pos).toString, withCol)
        withLine
      } else {
        attrs
      }
    }

    // add the dafint prefix if it doesn't already exist
    val intPrefix = scope.getPrefix(XMLUtils.INT_NS)
    val newScope = if (addPositionAttributes && intPrefix == null) {
      NamespaceBinding(XMLUtils.INT_PREFIX, XMLUtils.INT_NS, scope)
    } else {
      Assert.usage(intPrefix == null || intPrefix == XMLUtils.INT_PREFIX) // can't deal with some other binding for dafint
      scope
    }

    super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
  }
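
  // Sketch of the effect (values illustrative): with addPositionAttributes =
  // true, an element parsed at line 5, column 3 gains dafint:line="5" and
  // dafint:col="3"; the xs:schema or tdml:testSuite root element additionally
  // gains dafint:file set to the source URI.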

  /**
   * To emulate the behavior of Xerces loader (standard scala loader)
   * we have to normalize CRLF to LF, and solitary CR to LF.
   *
   * This is optional controlled by a constructor parameter.
   */
  override def text(pos: Int, txt: String): Text = {
    val newText: String = {
      if (normalizeCRLFtoLF && txt.contains("\r")) {
        txt.
          replaceAll("\r\n", "\n").
          replaceAll("\r", "\n")
      } else {
        txt
      }
    }
    //
    // On MS-Windows the TDML Runner previously would load XML
    // files and due to git autoCRLF=true, they would
    // have CRLFs in them. The loader the TDML Runner WAS
    // using (not any more) was preserving these CRLFs
    // in the XML infoset data, and so tests could come
    // to depend on this and be non-portable between
    // unix (LF only) and windows (CRLF only).
    //
    // Furthermore, the TDML file itself used to be loaded with this
    // CRLF-preserving loader.
    //
    // The TDML Runner now always normalizes CRLF or
    // isolated CR to LF like regular XML loaders do,
    // for both the TDML file itself, and any files it
    // loads. So this is no longer an issue.
    //
    super.text(pos, newText)
  }
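
  // Illustrative behavior of the normalization above:
  //   text(pos, "a\r\nb\rc") yields Text("a\nb\nc") when normalizeCRLFtoLF is
  //   true, and Text("a\r\nb\rc") (unchanged) when it is false.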

  /**
   * We override this to force the ConstructingParser to process CDATA regions
   * specially with an overridable method named cdata.
   *
   * Strangely, if you look at the implementation of this in the MarkupParser
   * trait, it calls the handler for text, but then ignores the result and
   * constructs a PCData node from the original text.
   *
   * It's possible this override amounts to a bug fix.
   */
  override def xCharData: NodeSeq = {
    xToken("[CDATA[")
    def mkResult(pos: Int, s: String): NodeSeq = {
      val s1 = cdata(pos, s).text
      PCData(s1)
    }
    xTakeUntil(mkResult, () => pos, "]]>")
  }
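
  // For example (illustrative), <e><![CDATA[  a < b  ]]></e> yields a PCData
  // child containing "  a < b  " verbatim: markup characters and whitespace
  // inside the CDATA region are preserved (subject only to the CRLF -> LF
  // normalization that cdata/text may apply).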

  /**
   * Same CRLF/CR => LF processing as text gets.
   */
  def cdata(pos: Int, s: String): NodeSeq = {
    text(pos, s)
  }

  /**
   * Same CRLF/CR => LF processing as text gets.
   */
  override def comment(pos: Int, s: String): Comment = {
    Comment(text(pos, s).text)
  }

  /**
   * Same CRLF/CR => LF processing as text gets.
   */
  override def procInstr(pos: Int, target: String, txt: String) =
    ProcInstr(target, text(pos, txt).text)

  private def parseXMLPrologAttributes(m: MetaData): (Option[String], Option[String], Option[Boolean]) = {

    var info_ver: Option[String] = None
    var info_enc: Option[String] = None
    var info_stdl: Option[Boolean] = None

    var n = 0
    m("version") match {
      case null =>
      case Text("1.0") =>
        info_ver = Some("1.0"); n += 1
      case _ => reportSyntaxError("cannot deal with versions != 1.0")
    }

    m("encoding") match {
      case null =>
      case Text(enc) =>
        if (!isValidIANAEncoding(enc))
          reportSyntaxError("\"" + enc + "\" is not a valid encoding")
        else {
          info_enc = Some(enc)
          n += 1
        }
    }

    m("standalone") match {
      case null =>
      case Text("yes") =>
        info_stdl = Some(true); n += 1
      case Text("no") =>
        info_stdl = Some(false); n += 1
      case _ => reportSyntaxError("either 'yes' or 'no' expected")
    }

    if (m.length - n != 0) {
      reportSyntaxError(
        "only 'version', 'encoding', and 'standalone' attributes are expected in xml prolog. Found: " + m)
    }

    (info_ver, info_enc, info_stdl)
  }
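
  // For example (illustrative), the prolog <?xml version="1.0" encoding="UTF-8"
  // standalone="yes"?> parses to (Some("1.0"), Some("UTF-8"), Some(true));
  // any attribute other than those three triggers the syntax error above.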

  /**
   * Override of document() to make it tolerant of the start of the file
   * being whitespace instead of a "<" character.
   *
   * This does not handle DOCTYPEs (aka DTDs) at all. Hence, it is not a
   * true replacement for (nor a bug fix of) the original ConstructingParser
   * method that it overrides.
   */
  override def document(): Document = {
    doc = new Document()
    this.dtd = null
    var children: NodeSeq = null

    if ('<' == ch) {
      nextch()
      if ('?' == ch) {
        nextch()
        // It's probably an XML prolog, but
        // there are cases where there is no XML prolog, but a starting
        // PI such as <?xml-model href="..."?>.
        // So we have to recognize it as a general PI, then look and see if
        // it is a prolog.
        val name = xName
        xSpace()
        val (md, scp) = xAttributes(TopScope)
        if (scp != TopScope)
          reportSyntaxError("no xmlns definitions allowed.")
        xToken('?')
        xToken('>')
        if (name == "xml") {
          val info_prolog = parseXMLPrologAttributes(md)
          doc.version = info_prolog._1
          doc.encoding = info_prolog._2
          doc.standAlone = info_prolog._3
        } else {
          // not an xml prolog. It's some other PI
          // do nothing. We're just skipping those PIs
        }
        children = content(TopScope)
      } else {
        val ts = new NodeBuffer()
        content1(TopScope, ts) // the 1 suffix means "without the first < character"
        ts &+ content(TopScope)
        children = NodeSeq.fromSeq(ts)
      }
    } else {
      children = content(TopScope)
    }

    var isErr = false
    var elemCount = 0
    var theNode: Node = null
    children.foreach { c =>
      c match {
        case _: ProcInstr => // skip
        case _: Comment => // skip
        // $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
        case _: EntityRef => {
          reportSyntaxError("no entity references allowed here")
          isErr = true
        }
        // $COVERAGE-ON$
        case s: SpecialNode => {
          val txt = s.toString.trim()
          if (txt.length > 0) {
            reportSyntaxError("non-empty text nodes not allowed: '" + txt + "'.")
            isErr = true
          }
        }
        case m: Elem =>
          elemCount += 1
          theNode = m
      }
    }
    if (1 != elemCount) {
      reportSyntaxError("document must contain exactly one element")
      isErr = true
    }

    if (!isErr) {
      doc.children = children
      doc.docElem = theNode
      doc
    } else {
      null
    }
  }
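
  // For example (illustrative), an input beginning "\n  <root/>" (whitespace
  // before the first '<') is tolerated by this override, where the original
  // ConstructingParser.document() expects '<' as the very first character.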

  def load(): Node = {
    val res =
      try {
        this.initialize
        val doc = this.document()
        if (doc == null) null
        else doc.docElem
      } catch {
        case e: Exception => {
          val exc = makeSAXParseException(curInput.pos, e.toString)
          errorHandler.fatalError(exc) // good place for a breakpoint
          null
        }
      }
    res
  }
}
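
/*
 * Minimal usage sketch (illustrative only, not part of the loader proper):
 * the file path is hypothetical, and the no-op DefaultHandler stands in for
 * a real org.xml.sax.ErrorHandler that would collect or rethrow the reported
 * SAXParseExceptions.
 */
object DaffodilConstructingLoaderExample {
  def main(args: Array[String]): Unit = {
    val uri = new URI("file:///tmp/example.dfdl.xsd") // hypothetical input file
    val handler = new org.xml.sax.helpers.DefaultHandler() // no-op ErrorHandler
    val loader = new DaffodilConstructingLoader(uri, handler, addPositionAttributes = true)
    val root: Node = loader.load() // null if a fatal error was reported
    println(root)
  }
}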



