// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// scala.xml.Utility.scala Maven / Gradle / Ivy
//
// The newest version!
/*
 * Scala (https://www.scala-lang.org)
 *
 * Copyright EPFL and Lightbend, Inc.
 *
 * Licensed under Apache License 2.0
 * (http://www.apache.org/licenses/LICENSE-2.0).
 *
 * See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership.
 */

package scala
package xml

import scala.annotation.tailrec
import scala.collection.mutable
import scala.language.implicitConversions
import scala.collection.Seq

/**
 * The `Utility` object provides utility functions for processing instances
 * of bound and not bound XML classes, as well as escaping text nodes.
 *
 * @author Burak Emir
 */
object Utility extends AnyRef with parsing.TokenTests {
  /**
   * The character U+001A. Within this object it is matched by `parseCharRef`,
   * which calls `reportTruncatedError` when it encounters this character in
   * the middle of a character reference.
   */
  final val SU: Char = '\u001A'

  // [Martin] Questionable: nowhere else do we implicitly turn a
  // StringBuilder into a String, so it is unclear why this exists.
  /** Implicit view yielding the text accumulated so far in `sb`. */
  implicit def implicitSbToString(sb: StringBuilder): String = sb.result()

  // Convenience wrapper for the very common pattern: allocate a fresh
  // StringBuilder, let `f` write into it, then return the collected text.
  private[xml] def sbToString(f: StringBuilder => Unit): String = {
    val buffer: StringBuilder = new StringBuilder
    f(buffer)
    buffer.result()
  }
  /** True when `x` reports itself as an atom but is not an instance of `Text`. */
  private[xml] def isAtomAndNotText(x: Node): Boolean =
    if (x.isAtom) !x.isInstanceOf[Text] else false

  /**
   * Trims an element and returns it as a single `Node`.
   *
   *  Use this only when the argument is known to be an element: adjacent
   *  text children are merged first, then every child is run through
   *  `trimProper`. If the argument might be something else (for example a
   *  text node that could be trimmed away entirely), call `trimProper`
   *  instead and be prepared to receive an empty sequence.
   *
   *  Precondition: `x` matches the `Elem` extractor; any other node is not
   *  handled by this match.
   */
  def trim(x: Node): Node = x match {
    case Elem(pfx, label, attribs, scope, kids@_*) =>
      val merged: Seq[Node] = combineAdjacentTextNodes(kids)
      val trimmed: Seq[Node] = merged.flatMap(trimProper)
      // An element left without children is flagged as minimizable.
      Elem(pfx, label, attribs, scope, trimmed.isEmpty, trimmed: _*)
  }

  /**
   * Collapses every run of neighbouring `Text` nodes in `children` into a
   * single `Text` whose content is the concatenation of the run. All other
   * nodes are kept as they are, in their original order.
   */
  private def combineAdjacentTextNodes(children: Seq[Node]): Seq[Node] =
    children.foldRight(Seq.empty[Node]) { (node, acc) =>
      // Walking right-to-left: `acc` already holds everything after `node`.
      (node, acc.headOption) match {
        case (Text(current), Some(Text(following))) =>
          Text(current + following) +: acc.tail
        case _ =>
          node +: acc
      }
    }

  /**
   * Trims a child of an element.
   *
   *  - An element has its adjacent text children merged and each child
   *    trimmed recursively.
   *  - A `Text` node is routed through a `TextBuffer`, whose `toText` result
   *    is returned (this may be an empty sequence, per the note on `trim`).
   *  - Anything else (e.g. `Atom` nodes that are not `Text`) is returned
   *    unchanged.
   */
  def trimProper(x: Node): Seq[Node] = x match {
    case Elem(pfx, label, attribs, scope, kids@_*) =>
      val merged: Seq[Node] = combineAdjacentTextNodes(kids)
      val trimmed: Seq[Node] = merged.flatMap(trimProper)
      Elem(pfx, label, attribs, scope, trimmed.isEmpty, trimmed: _*)
    case Text(content) =>
      val buffer = new TextBuffer()
      buffer.append(content).toText
    case other =>
      other
  }

  /**
   * Returns the attribute list ordered by ascending `key`.
   *
   * The head of the list serves as pivot: attributes with a strictly smaller
   * key are sorted and placed before it, those with a strictly greater key
   * are sorted and placed after it. Lists with zero or one attribute are
   * returned as-is.
   *
   * NOTE(review): because both filters are strict, other attributes whose key
   * equals the pivot's key are in neither partition - confirm this is intended.
   */
  def sort(md: MetaData): MetaData =
    if (md.isNull || md.next.isNull) md
    else {
      val pivot: String = md.key
      val before: MetaData = sort(md.filter(_.key < pivot))
      val after: MetaData = sort(md.filter(_.key > pivot))
      // Re-link: pivot in front of `after`, then prepend `before` right-to-left.
      val pivotOnward: MetaData = md.copy(after)
      before.foldRight(pivotOnward) { (attr, rest) => attr.copy(rest) }
    }

  /**
   * Returns the node with its attribute list sorted by key, applied
   *  recursively to all child elements (prefixes are ignored when ordering).
   *  Nodes that do not match the `Elem` extractor are returned unchanged.
   */
  def sort(n: Node): Node = n match {
    case Elem(pfx, label, attribs, scope, kids@_*) =>
      val sortedKids: Seq[Node] = kids.map(kid => sort(kid))
      val sortedAttribs: MetaData = sort(attribs)
      Elem(pfx, label, sortedAttribs, scope, sortedKids.isEmpty, sortedKids: _*)
    case other =>
      other
  }

  /**
   * Returns `text` with the characters `<`, `>`, `&` and `"` replaced by
   * their entity references.
   */
  final def escape(text: String): String = {
    val out: StringBuilder = new StringBuilder
    escape(text, out)
    out.toString
  }

  object Escapes {
    /**
     * Entity name to character table.
     *
     * For reasons unclear, escape and unescape are far from being logical
     * inverses: `apos` is recognised when unescaping but is never produced
     * when escaping.
     */
    val pairs: Map[String, Char] = Map(
      "lt" -> '<',
      "gt" -> '>',
      "amp" -> '&',
      "quot" -> '"',
      "apos"  -> '\''
    )

    /** Character to entity reference (e.g. `<` to `&lt;`); `apos` is left out. */
    val escMap: Map[Char, String] =
      for ((name, ch) <- pairs - "apos") yield ch -> s"&$name;"

    /** Entity name to character, used when unescaping; identical to `pairs`. */
    val unescMap: Map[String, Char] = pairs
  }
  import Escapes.{ escMap, unescMap }

  /**
   * Appends the escaped form of `text` to `s` and returns `s`.
   *
   * Characters present in `escMap` are written as their entity reference.
   * Any other character is copied only if it is at or above the space
   * character, or is a newline, carriage return or tab; remaining
   * characters are dropped.
   */
  final def escape(text: String, s: StringBuilder): StringBuilder = {
    // Implemented per XML spec:
    // http://www.w3.org/International/questions/qa-controls
    text.foreach { ch =>
      escMap.get(ch) match {
        case Some(entity) =>
          s ++= entity
        case None =>
          val keep: Boolean = ch >= ' ' || ch == '\n' || ch == '\r' || ch == '\t'
          if (keep) s += ch // otherwise: noop
      }
    }
    s
  }

  /**
   * Appends the character for the predefined entity named `ref` to `s`,
   * e.g. `amp` becomes `&` and `lt` becomes `<`.
   *
   * @return    `s` after appending, or `'''null'''` if `ref` is not a
   *            predefined entity (in which case `s` is left untouched).
   */
  final def unescape(ref: String, s: StringBuilder): StringBuilder =
    unescMap.get(ref) match {
      case Some(ch) => s.append(ch)
      case None     => null
    }

  /**
   * Gathers, into a freshly created mutable set, every namespace used by
   * the given nodes and all of their descendants (empty namespaces included).
   */
  def collectNamespaces(nodes: Seq[Node]): mutable.Set[String] = {
    val found: mutable.HashSet[String] = new mutable.HashSet[String]
    nodes.foreach(node => collectNamespaces(node, found))
    found
  }

  /**
   * Adds to `set` the namespace of `n`, the namespaces of its prefixed
   * attributes, and - recursively - those of its children. Nothing is added
   * when `n.doCollectNamespaces` is false.
   */
  def collectNamespaces(n: Node, set: mutable.Set[String]): Unit =
    if (n.doCollectNamespaces) {
      set += n.namespace
      n.attributes.foreach {
        case prefixed: PrefixedAttribute => set += prefixed.getNamespace(n)
        case _                           => // unprefixed attributes contribute nothing
      }
      n.child.foreach(kid => collectNamespaces(kid, set))
    }

  // def toXML(
  //   x: Node,
  //   pscope: NamespaceBinding = TopScope,
  //   sb: StringBuilder = new StringBuilder,
  //   stripComments: Boolean = false,
  //   decodeEntities: Boolean = true,
  //   preserveWhitespace: Boolean = false,
  //   minimizeTags: Boolean = false): String =
  // {
  //   toXMLsb(x, pscope, sb, stripComments, decodeEntities, preserveWhitespace, minimizeTags)
  //   sb.toString
  // }

  /**
   * Serialize the provided Node to the provided StringBuilder.
   *
   * Note that calling this source-compatible method will result in the same old,
   * arguably almost universally unwanted, behaviour.
   *
   * The Boolean `minimizeTags` is mapped to `MinimizeMode.Always` when true and
   * `MinimizeMode.Never` when false before delegating to `serialize`.
   */
  @deprecated("Please use `serialize` instead and specify a `minimizeTags` parameter", "2.10.0")
  def toXML(
    x: Node,
    pscope: NamespaceBinding = TopScope,
    sb: StringBuilder = new StringBuilder,
    stripComments: Boolean = false,
    decodeEntities: Boolean = true,
    preserveWhitespace: Boolean = false,
    minimizeTags: Boolean = false
  ): StringBuilder =
    serialize(x, pscope, sb, stripComments, decodeEntities, preserveWhitespace, if (minimizeTags) MinimizeMode.Always else MinimizeMode.Never)

  /**
   * Serialize an XML Node to a StringBuilder.
   *
   * This is essentially a minor rework of `toXML` that can't have the same name due to an unfortunate
   * combination of named/default arguments and overloading.
   *
   * `decodeEntities` and `preserveWhitespace` are accepted but not forwarded to
   * the implementation.
   *
   * @return the same `sb` that was passed in, after appending
   * @todo use a Writer instead
   */
  def serialize(
    x: Node,
    pscope: NamespaceBinding = TopScope,
    sb: StringBuilder = new StringBuilder,
    stripComments: Boolean = false,
    decodeEntities: Boolean = true,
    preserveWhitespace: Boolean = false,
    minimizeTags: MinimizeMode.Value = MinimizeMode.Default
  ): StringBuilder = {
    serializeImpl(List(x), pscope, false, stripComments, minimizeTags, sb)
    sb
  }

  /**
   * Appends the serialized form of `ns` to `sb`.
   *
   * The traversal is an explicit-stack, tail-recursive loop rather than plain
   * recursion over the tree. Four parallel stacks are kept: the remaining
   * siblings per level (`nss`), the namespace scope per level, the "separate
   * siblings by a space" flag per level, and the element whose closing tag
   * must be written once the level is exhausted (`null` for a `Group`, which
   * has no tag of its own).
   */
  private def serializeImpl(
    ns: Seq[Node],
    pscope: NamespaceBinding,
    spaced: Boolean,
    stripComments: Boolean,
    minimizeTags: MinimizeMode.Value,
    sb: StringBuilder
  ): Unit = {
    @tailrec
    def ser(nss: List[List[Node]], pscopes: List[NamespaceBinding], spaced: List[Boolean], toClose: List[Node]): Unit = nss match {
      // Only the (now empty) top level is left: done.
      case List(Nil) =>
      // Current level exhausted: close its element (if any) and pop all stacks.
      case Nil :: rests =>
        if (toClose.head != null) {
          sb.append("</")
          toClose.head.nameToString(sb)
          sb.append('>')
        }
        ser(rests, pscopes.tail, spaced.tail, toClose.tail)
      case (n :: ns) :: r =>
        // Emits a separating space when more siblings follow on a spaced level.
        def sp(): Unit = if (ns.nonEmpty && spaced.head) sb.append(' ')
        n match {
          case c: Comment =>
            if (!stripComments) {
              c.buildString(sb)
              sp()
            }
            ser(ns :: r, pscopes, spaced, toClose)
          case s: SpecialNode =>
            s.buildString(sb)
            sp()
            ser(ns :: r, pscopes, spaced, toClose)
          case g: Group =>
            // Descend into the group's nodes; `null` marks "no closing tag".
            ser(g.nodes.toList :: ns :: r, g.scope :: pscopes, false :: spaced, null :: toClose)
          case e: Elem =>
            sb.append('<')
            e.nameToString(sb)
            if (e.attributes != null) e.attributes.buildString(sb)
            e.scope.buildString(sb, pscopes.head)
            if (e.child.isEmpty &&
              (minimizeTags == MinimizeMode.Always ||
                (minimizeTags == MinimizeMode.Default && e.minimizeEmpty))) {
              // no children, so use short form (self-closing tag)
              sb.append("/>")
              sp()
              ser(ns :: r, pscopes, spaced, toClose)
            } else {
              sb.append('>')
              val csp = e.child.forall(isAtomAndNotText)
              ser(e.child.toList :: ns :: r, e.scope :: pscopes, csp :: spaced, e :: toClose)
            }
          case n => throw new IllegalArgumentException(s"Don't know how to serialize a ${n.getClass.getName}")
        }
    }
    ser(List(ns.toList), List(pscope), List(spaced), Nil)
  }

  /**
   * Appends the serialized form of `children` to `sb`; does nothing for an
   * empty sequence. Siblings are separated by a space when every child is an
   * atom that is not a `Text`.
   *
   * `decodeEntities` and `preserveWhitespace` are accepted but not forwarded to
   * the implementation.
   */
  def sequenceToXML(
    children: Seq[Node],
    pscope: NamespaceBinding = TopScope,
    sb: StringBuilder = new StringBuilder,
    stripComments: Boolean = false,
    decodeEntities: Boolean = true,
    preserveWhitespace: Boolean = false,
    minimizeTags: MinimizeMode.Value = MinimizeMode.Default
  ): Unit =
    if (children.nonEmpty) {
      val spaced = children.forall(isAtomAndNotText)
      serializeImpl(children, pscope, spaced, stripComments, minimizeTags, sb)
    }

  /**
   * Splits a qualified name at its first `:`.
   *
   * @return `(Some(prefix), localPart)` when a colon is present,
   *         `(None, name)` otherwise
   */
  def splitName(name: String): (Option[String], String) = {
    val colon: Int = name.indexOf(':')
    if (colon < 0) (None, name)
    else (Some(name.take(colon)), name.drop(colon + 1))
  }

  /**
   * Returns prefix of qualified name if any.
   */
  final def prefix(name: String): Option[String] = splitName(name)._1

  /**
   * Returns a hashcode for the given constituents of a node
   */
  def hashCode(pre: String, label: String, attribHashCode: Int, scpeHash: Int, children: Seq[Node]): Int =
    scala.util.hashing.MurmurHash3.orderedHash(label +: attribHashCode +: scpeHash +: children, pre.##)

  /** Returns `s` wrapped in quotes, see the two-argument `appendQuoted`. */
  def appendQuoted(s: String): String = sbToString(appendQuoted(s, _))

  /**
   * Appends `s` wrapped in double quotes if `s` does not contain a double
   * quote, and wrapped in single quotes otherwise.
   */
  def appendQuoted(s: String, sb: StringBuilder): StringBuilder = {
    val ch: Char = if (s.contains('"')) '\'' else '"'
    sb.append(s"$ch$s$ch")
  }

  /**
   * Appends `s` wrapped in double quotes, writing every double quote inside
   * `s` as a backslash followed by a double quote.
   */
  def appendEscapedQuoted(s: String, sb: StringBuilder): StringBuilder = {
    sb.append('"')
    for (c <- s) c match {
      case '"' =>
        sb.append('\\'); sb.append('"')
      case _ => sb.append(c)
    }
    sb.append('"')
  }

  /**
   * Reads a name from `s` starting at `index`.
   *
   * @return `'''null'''` if `index` is at or past the end of `s`; the empty
   *         string if the character at `index` is not a name-start character;
   *         otherwise the longest run of name characters from `index`
   */
  def getName(s: String, index: Int): String =
    if (index >= s.length) null
    else {
      val xs: String = s.drop(index)
      if (xs.nonEmpty && isNameStart(xs.head)) xs.takeWhile(isNameChar)
      else ""
    }

  /**
   * Returns `'''null'''` if the value is a correct attribute value,
   * error message if it isn't.
   */
  def checkAttributeValue(value: String): String = {
    var i: Int = 0
    while (i < value.length) {
      value.charAt(i) match {
        case '<' =>
          return "< not allowed in attribute value"
        case '&' =>
          val n: String = getName(value, i + 1)
          if (n == null)
            return s"malformed entity reference in attribute value [$value]"
          // Skip over the entity name; the next character must be the ';'.
          i = i + n.length + 1
          if (i >= value.length || value.charAt(i) != ';')
            return s"malformed entity reference in attribute value [$value]"
        case _ =>
      }
      i = i + 1
    }
    null
  }

  /**
   * Parses an attribute value into nodes: literal text and predefined or
   * character references are accumulated into `Text`, while any other entity
   * reference becomes an `EntityRef`.
   *
   * @return a single `Text` when the value contains no unknown entity
   *         reference and is non-empty, otherwise the buffer of nodes built
   */
  def parseAttributeValue(value: String): Seq[Node] = {
    val sb: StringBuilder = new StringBuilder
    var rfb: StringBuilder = null
    val nb: NodeBuffer = new NodeBuffer()

    val it: Iterator[Char] = value.iterator
    while (it.hasNext) {
      var c: Char = it.next()
      // entity! flush buffer into text node
      if (c == '&') {
        c = it.next()
        if (c == '#') {
          c = it.next()
          val theChar: String = parseCharRef ({ () => c }, { () => c = it.next() }, { s => throw new RuntimeException(s) }, { s => throw new RuntimeException(s) })
          sb.append(theChar)
        } else {
          if (rfb == null) rfb = new StringBuilder()
          rfb.append(c)
          c = it.next()
          while (c != ';') {
            rfb.append(c)
            c = it.next()
          }
          val ref: String = rfb.toString
          rfb.clear()
          unescape(ref, sb) match {
            case null =>
              if (sb.nonEmpty) { // flush buffer
                nb += Text(sb.toString)
                sb.clear()
              }
              nb += EntityRef(ref) // add entityref
            case _ =>
          }
        }
      } else sb.append(c)
    }
    if (sb.nonEmpty) { // flush buffer
      val x: Text = Text(sb.toString)
      if (nb.isEmpty)
        return x
      else
        nb += x
    }
    nb
  }

  /**
   * {{{
   *   CharRef ::= "&#" '0'..'9' {'0'..'9'} ";"
   *             | "&#x" '0'..'9'|'A'..'F'|'a'..'f' { hexdigit } ";"
   * }}}
   * See [66]
   *
   * Expects `ch()` to be positioned just after `&#`. Digits are consumed via
   * `nextch()` until `ch()` yields `;` or the character with code 0.
   *
   * @param ch                   yields the current character
   * @param nextch               advances to the next character
   * @param reportSyntaxError    called for a character not allowed in the reference
   * @param reportTruncatedError called when `SU` is encountered
   * @return the referenced code point as a string, or `""` if the
   *         accumulated value is 0
   */
  def parseCharRef(ch: () => Char, nextch: () => Unit, reportSyntaxError: String => Unit, reportTruncatedError: String => Unit): String = {
    val hex: Boolean = (ch() == 'x') && { nextch(); true }
    val base: Int = if (hex) 16 else 10
    var i: Int = 0
    while (ch() != ';' && ch() != 0) {
      ch() match {
        case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' =>
          i = i * base + ch().asDigit
        case 'a' | 'b' | 'c' | 'd' | 'e' | 'f'
          | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' =>
          if (!hex)
            reportSyntaxError("hex char not allowed in decimal char ref\nDid you mean to write &#x ?")
          else
            i = i * base + ch().asDigit
        case SU =>
          reportTruncatedError("")
        case _ =>
          reportSyntaxError(s"character '${ch()}' not allowed in char ref\n")
      }
      nextch()
    }
    if (i != 0) new String(Array(i), 0, 1) else ""
  }
}





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy