All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.daffodil.util.Misc.scala Maven / Gradle / Ivy

There is a newer version: 3.9.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.daffodil.util

import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.File
import java.io.InputStream
import java.net.URI
import java.net.URL
import java.net.URLClassLoader
import java.nio.ByteBuffer
import java.nio.CharBuffer
import java.nio.channels.ReadableByteChannel
import java.nio.channels.WritableByteChannel
import java.nio.charset.CodingErrorAction
import java.nio.charset.{ Charset => JavaCharset }
import java.nio.file.Files
import java.nio.file.Paths

import scala.collection.JavaConverters._

import org.apache.daffodil.equality._

/**
 * Various reusable utilities that I couldn't easily find a better place for.
 */
object Misc {

  /**
   * Wraps a value in an Option depending on a condition.
   *
   * @param test  when true the result is Some(thing), otherwise None
   * @param thing by-name value; it is not evaluated when `test` is false
   */
  def boolToOpt[T](test: Boolean, thing: => T): Option[T] =
    if (!test) None
    else Some(thing)

  /**
   * Produces a short name for the runtime class of an object.
   *
   * The package prefix is dropped, the remaining name is split on '$',
   * and the last piece that is neither empty nor all digits is returned.
   *
   * @param obj any value; null yields the string "null"
   */
  def getNameFromClass(obj: Any): String = {
    if (obj == null) "null"
    else {
      val unqualified = obj.getClass.getName.split("""\.""").last
      // Walk the '$'-separated pieces from the end, skipping pieces that
      // are empty or purely numeric.
      unqualified.split("""\$""").reverseIterator.filterNot(_.matches("""\d*""")).next()
    }
  }

  /**
   * Returns `s` without its trailing `suffix`. When `s` does not end with
   * `suffix`, `s` itself is returned.
   */
  def stripSuffix(s: String, suffix: String) =
    if (!s.endsWith(suffix)) s
    else s.dropRight(suffix.length)

  /**
   * Removes one leading and one trailing double-quote character, each only
   * if present. The two ends are checked independently, so an unmatched
   * quote at either end is removed as well.
   */
  def stripQuotes(s: String) = {
    val quote = "\""
    val withoutLeading = if (s.startsWith(quote)) s.drop(1) else s
    if (withoutLeading.endsWith(quote)) withoutLeading.dropRight(1)
    else withoutLeading
  }

  /**
   * True when the string is null, empty, or trims down to the empty string.
   */
  def isNullOrBlank(s: String) = s == null || s.trim.isEmpty

  /**
   * Tests whether a URI is an absolute URI with the "file" scheme.
   *
   * The scheme is read from the URI itself rather than by converting to a
   * java.net.URL first: URI.toURL throws MalformedURLException for any
   * absolute URI whose scheme has no registered protocol handler
   * (for example "urn:"), and such a URI is simply not a file URI.
   *
   * @param uri the URI to examine
   * @return true only when the URI has a scheme and it is "file",
   *         compared case-insensitively; false for relative URIs
   */
  def isFileURI(uri: URI) = {
    // getScheme is null for a relative URI, and equalsIgnoreCase(null) is false.
    "file".equalsIgnoreCase(uri.getScheme)
  }

  /**
   * Looks a resource up through this class's getResource, relative to the
   * classpath root.
   *
   * Every whitespace character in the given path is replaced by "%20", and
   * a leading "/" is added when it is missing.
   *
   * @param resourcePathRaw path of the resource, relative to the classpath root
   * @return a pair of the resource's URI (None when it was not found) and
   *         the normalized path that was actually looked up
   */
  def getResourceOption(resourcePathRaw: String): (Option[URI], String) = {
    val escaped = resourcePathRaw.replaceAll("""\s""", "%20")

    // People routinely forget that the initial "/" is what gives
    // classpath-relative behavior, so make sure it is always there.
    val resPath = if (escaped.startsWith("/")) escaped else "/" + escaped

    val found = Option(this.getClass().getResource(resPath)).map(_.toURI)
    (found, resPath)
  }

  /**
   * Finds a resource either directly on the classpath or, failing that,
   * relative to a context URI.
   *
   * @param rawResName    resource name; whitespace characters are replaced by "%20"
   * @param optContextURI optional URI the name is resolved against when the
   *                      classpath lookup finds nothing
   * @return the URI of the resource, or None when neither lookup succeeds
   */
  def getResourceRelativeOption(rawResName: String, optContextURI: Option[URI]): Option[URI] = {
    val resName = rawResName.replaceAll("""\s""", "%20")
    Misc.getResourceOption(resName)._1 match {
      case onClasspath @ Some(_) => onClasspath // found directly on the classpath.
      case None =>
        //
        // Try relative to the enclosing context URI.
        //
        // This goes through the URL constructor because URI.resolve(uri)
        // doesn't work against so called opaque URIs, and the jar URIs we
        // get here when the resource is in a jar are opaque.
        // Some discussion of this issue is https://issues.apache.org/jira/browse/XMLSCHEMA-3
        //
        optContextURI.flatMap { contextURI =>
          tryURL(new URL(contextURI.toURL, resName))
        }
    }
  }

  /**
   * Searches for a resource name using a few lenient heuristics.
   *
   * The name is first tried as a file system path (or as a URI when it has
   * a scheme). If no such file exists it is looked up through
   * getResourceRelativeOption. If that also fails and the name contains
   * "/" separators, the search is repeated with only the last path segment.
   *
   * This laxness is handy where directory structures do not exactly match
   * paths (e.g. testing), but it means the wrong file can be found when
   * names are ambiguous. When that would be a serious problem, use
   * getResourceOption or getResourceRelativeOption instead.
   *
   * @param resName    resource name, path, or URI string
   * @param relativeTo optional context URI handed to getResourceRelativeOption
   * @return the URI of the first match, or None
   */
  def searchResourceOption(resName: String, relativeTo: Option[URI]): Option[URI] = {
    val asURI = new URI(resName)
    val asPath = Option(asURI.getScheme) match {
      case Some(_) => Paths.get(asURI)
      case None => Paths.get(resName)
    }
    val firstTry: Option[URI] =
      if (!Files.exists(asPath)) Misc.getResourceRelativeOption(resName, relativeTo)
      else Some(asPath.toFile().toURI())
    firstTry.orElse {
      // Nothing found: drop the directory part, if there is one, and retry
      // recursively with just the file name.
      resName.split("/") match {
        case parts if parts.length > 1 => searchResourceOption(parts.last, relativeTo)
        case _ => None
      }
    }
  }

  /**
   * Probes a URL by opening a stream on it.
   *
   * @return Some(url.toURI) when the stream could be opened, None when
   *         opening it raised an IOException. The stream is always closed.
   */
  private def tryURL(url: URL): Option[URI] = {
    var opened: Option[InputStream] = None
    try {
      opened = Some(url.openStream())
      // Opening worked, so the resource exists.
      Some(url.toURI)
    } catch {
      case _: java.io.IOException => None
    } finally {
      opened.foreach(_.close())
    }
  }

  /**
   * URLs of the class loader that loaded this object, when that loader is
   * a URLClassLoader; otherwise an empty sequence.
   */
  lazy val classPath = this.getClass().getClassLoader() match {
    case loader: URLClassLoader => loader.getURLs().toSeq
    case _ => Seq.empty
  }

  /**
   * Like getResourceOption, but the resource must exist.
   *
   * @param resourcePath path of the resource relative to the classpath root
   * @return the URI of the resource
   * @throws java.io.FileNotFoundException when the resource is not found;
   *         the message lists the known classpath, if any
   */
  def getRequiredResource(resourcePath: String): URI = {
    val (maybeURI, resPath) = getResourceOption(resourcePath)
    maybeURI.getOrElse {
      val classPathText =
        if (classPath.length == 0) "unknown."
        else ": " + classPath.mkString("\n")
      throw new java.io.FileNotFoundException(
        "Required resource " + resPath + " was not found.\nClasspath is " + classPathText)
    }
  }

  /**
   * Upper-cases the first character of a string and leaves the rest as is.
   * An empty string is returned unchanged instead of failing on `head`.
   */
  def initialUpperCase(s: String): String =
    if (s.isEmpty) s else s.head.toUpper.toString + s.substring(1)
  /**
   * Lower-cases the first character of a string and leaves the rest as is.
   * An empty string is returned unchanged instead of failing on `head`.
   */
  def initialLowerCase(s: String): String =
    if (s.isEmpty) s else s.head.toLower.toString + s.substring(1)

  /**
   * Convert FooBar to fooBar, but leave FOOBAR as FOOBAR.
   *
   * A string is returned unchanged when it is empty, when its first
   * character is already lower case, or when no character after the first
   * is lower case. Otherwise only the first character is lower-cased.
   *
   * The empty string used to fail with an index-out-of-bounds exception on
   * the first-character test; it is now returned as is.
   */
  def toInitialLowerCaseUnlessAllUpperCase(s: String): String = {
    if (s.isEmpty || s(0).isLower) s
    // Same condition as isAllUpper(s, 1): nothing after the first character is lower case.
    else if (s.drop(1).forall(ch => !ch.isLower)) s
    else s(0).toLower.toString + s.substring(1)
  }

  /**
   * True when no character of `s` at index `start` or later is lower case.
   * Characters without case (digits, punctuation) do not count against it,
   * and the result is true when `start` is at or past the end of `s`.
   */
  def isAllUpper(s: String, start: Int): Boolean =
    (start until s.length).forall { idx => !s(idx).isLower }

  /**
   * Returns true if a StringBuilder is non-empty and every character in it
   * is whitespace. An empty StringBuilder yields false.
   */
  def isAllWhitespace(sb: StringBuilder): Boolean =
    sb.nonEmpty && sb.forall(_.isWhitespace)

  /**
   * Returns the implementation version recorded for this class's package,
   * or the empty string when none is available.
   */
  def getDaffodilVersion: String =
    Option(this.getClass.getPackage.getImplementationVersion).getOrElse("")

  /**
   * Converts a string of hexadecimal digits into bytes, two digits per byte.
   *
   * Requires an even number of nibbles.
   *
   * @param hex hex digits, upper or lower case, with no separators
   * @return one byte per pair of digits, in order
   * @throws java.lang.IllegalArgumentException when the length is odd or a
   *         character is not one of 0-9, a-f, A-F
   */
  def hex2Bytes(hex: String): Array[Byte] = {
    // This function originally just looked like this:
    //
    //   hex.sliding(2, 2).toArray.map { Integer.parseInt(_, 16).toByte }
    //
    // Although that is very clear and simple, it is actually very slow. The
    // code below, which accomplishes the same thing, is about 2 orders of
    // magnitude faster. We're leaving the above comment in place to more
    // clearly show what this function is doing, but keeping this below code
    // due to its speed, which is very important for schemas containing
    // xs:hexBinary types.

    // Value (0-15) of a single hex digit. The message names the ranges
    // that are actually accepted: only a-f and A-F are valid letters.
    def hexCharToValue(c: Char): Int = {
      if (c >= '0' && c <= '9') c - '0'
      else if (c >= 'A' && c <= 'F') (c - 'A') + 10
      else if (c >= 'a' && c <= 'f') (c - 'a') + 10
      else throw new java.lang.IllegalArgumentException("Hex character must be 0-9, a-f, or A-F, but was '" + c + "'")
    }

    val len = hex.length
    if (len % 2 != 0) {
      throw new java.lang.IllegalArgumentException("Hex string must have an even number of characters, but was " + len)
    }
    val numBytes: Int = len / 2
    val arr = new Array[Byte](numBytes)
    var arrIdx: Int = 0
    var hexIdx: Int = 0
    while (arrIdx < numBytes) {
      // High nibble first, then low nibble.
      val l = hexCharToValue(hex(hexIdx))
      val r = hexCharToValue(hex(hexIdx + 1))
      arr(arrIdx) = ((l << 4) | r).toByte
      arrIdx += 1
      hexIdx += 2
    }
    arr
  }

  /**
   * Expands each hexadecimal digit of a string into its 4-character binary
   * form and concatenates the results.
   */
  def hex2Bits(hex: String): String = {
    hex.map { digit =>
      val nibble = Integer.parseInt(digit.toString, 16)
      val binary = Integer.toBinaryString(nibble)
      // Left-pad with zeros to a full nibble.
      ("0" * (4 - binary.length)) + binary
    }.mkString
  }

  // Upper-case hex digit for each nibble value 0-15.
  private val hexLookup = "0123456789ABCDEF".toArray

  /**
   * Renders bytes as upper-case hexadecimal, two digits per byte, with no
   * separators.
   */
  def bytes2Hex(bytes: Array[Byte]): String = {
    val digits = new Array[Char](bytes.length * 2)
    var i = 0
    while (i < bytes.length) {
      val unsigned = bytes(i) & 0xFF
      val out = i * 2
      digits(out) = hexLookup(unsigned >>> 4)
      digits(out + 1) = hexLookup(unsigned & 0x0F)
      i += 1
    }
    new String(digits)
  }

  /**
   * Converts a string of '0' and '1' characters into bytes. Any other
   * character acts as a separator and is dropped. The empty string gives
   * an empty array.
   */
  def bits2Bytes(bits: String): Array[Byte] =
    if (bits.isEmpty()) Array.empty[Byte]
    else bits2Bytes(Seq(bits))

  /**
   * Converts several strings of bits into bytes. All strings are split at
   * every character that is not '0' or '1', the pieces are concatenated,
   * and each run of up to 8 bits is parsed as one byte.
   */
  def bits2Bytes(bits: Seq[String]): Array[Byte] = {
    val bitString = bits.flatMap(_.split("[^01]")).mkString
    bitString.grouped(8).map { octet => Integer.parseInt(octet, 2).toByte }.toArray
  }

  /**
   * Renders each byte as exactly 8 binary digits (treating it as unsigned)
   * and concatenates the results.
   */
  def bytes2Bits(bytes: Array[Byte]): String = {
    val out = new java.lang.StringBuilder(bytes.length * 8)
    bytes.foreach { b =>
      val binary = Integer.toBinaryString(b & 0xFF)
      // Left-pad with zeros to a full byte.
      out.append("0" * (8 - binary.length)).append(binary)
    }
    out.toString
  }

  // Moved here from Compiler object.

  /**
   * Creates a channel that reads the UTF-8 encoding of a string.
   */
  def stringToReadableByteChannel(s: String): ReadableByteChannel = {
    // Always encode explicitly; never use the default charset. NEVER.
    val utf8Stream = new ByteArrayInputStream(s.getBytes("utf-8"))
    java.nio.channels.Channels.newChannel(utf8Stream)
  }

  /**
   * Creates a writable channel over an in-memory byte buffer.
   *
   * Only the length of `s` is used, as the initial capacity of the
   * buffer; the characters of `s` are not written to the channel.
   */
  def stringToWritableByteChannel(s: String): WritableByteChannel = {
    // TODO: get byte count by encoding
    val sink = new ByteArrayOutputStream(s.length())
    java.nio.channels.Channels.newChannel(sink)
  }

  /**
   * Creates a channel that reads from the given byte array.
   */
  def byteArrayToReadableByteChannel(bytes: Array[Byte]): ReadableByteChannel =
    java.nio.channels.Channels.newChannel(new ByteArrayInputStream(bytes))

  /**
   * Creates a writable channel over a new in-memory byte buffer.
   *
   * @param size initial capacity of the underlying ByteArrayOutputStream
   */
  def byteArrayToWritableByteChannel(size: Int): WritableByteChannel =
    java.nio.channels.Channels.newChannel(new ByteArrayOutputStream(size))

  /**
   * Opens a file for reading and returns a channel over it. Closing the
   * channel is the caller's responsibility.
   */
  def fileToReadableByteChannel(file: java.io.File): ReadableByteChannel =
    java.nio.channels.Channels.newChannel(new java.io.FileInputStream(file))

  /**
   * Produces a copy of a string in which the characters that move the
   * cursor around the screen are replaced by characters with a simple glyph.
   *
   * When a stream of data is displayed for debugging or in a diagnostic
   * message, position-controlling characters such as CR, LF, FF, VT, HT and
   * BS make it hard to see what is going on. Each character is therefore
   * passed through remapControlOrLineEndingToVisibleGlyphs, which
   * substitutes picture characters from the Unicode x2400 block.
   */
  def remapControlsAndLineEndingsToVisibleGlyphs(s: String) = {
    val visible = new java.lang.StringBuilder(s.length)
    s.foreach { ch => visible.append(remapControlOrLineEndingToVisibleGlyphs(ch)) }
    visible.toString
  }

  /**
   * Maps a single character to a visible stand-in when it is a control
   * character, a space, a separator/joiner, a bi-di control, or a byte
   * order mark. Every other character is returned unchanged.
   *
   * C0 controls (0x00-0x1F) map to the control pictures 0x2400-0x241F,
   * space maps to 0x2423 and DEL to 0x2421. Everything else that is
   * remapped becomes the substitute picture 0x2426.
   *
   * Characters that are already in the range this function maps onto
   * (0x2400 through 0x2423, both ends included) are also replaced by the
   * substitute picture, so that a picture character in the output always
   * stands for a remapped character.
   */
  def remapControlOrLineEndingToVisibleGlyphs(c: Char) = {
    val URC = 0x2426 // Unicode control picture character for substitution (also looks like arabic q-mark)
    val code = c.toInt match {
      //
      // C0 Control pictures
      case n if (n <= 0x1F) => n + 0x2400
      case 0x20 => 0x2423 // For space we use the ␣ (Unicode OPEN BOX)
      case 0x7F => 0x2421 // DEL pic isn't at 0x247F, it's at 0x2421
      //
      // Unicode separators & joiners
      case 0x00A0 => URC // no-break space
      case 0x200B => URC // zero width space
      case 0x2028 => URC // line separator
      case 0x2029 => URC // paragraph separator
      case 0x200C => URC // zero width non-joiner
      case 0x200D => URC // zero width joiner
      case 0x2060 => URC // word joiner
      // bi-di controls
      case 0x200E | 0x200F => URC
      case b if (b >= 0x202A && b <= 0x202E) => URC
      // byte order mark
      case 0xFEFF => URC // ZWNBS aka Byte Order Mark
      case 0xFFFE => URC // non-character FFFE (a byte-swapped Byte Order Mark)
      case 0xFFFF => URC // non-character FFFF
      // we assume surrogate codepoints all have a glyph (depends on font used of course)
      //
      // TODO: this could go on and on. There's a flock of 'space' characters (EM SPACE)
      // all over the place in Unicode.
      //
      // TODO: combining characters,
      // all whitespace, zero-width, and combining/joining characters would be
      // represented by a separate glyph-character.
      //
      // Probably could be done by checking the character against some
      // unicode regex character classes like \p{M} which is the class
      // of combining mark characters
      //
      //
      // Special case - if incoming character is one of the glyph
      // characters we're remapping onto, then change to URC.
      // The bounds are inclusive: 0x2400 is the picture for NUL and
      // 0x2423 is the picture for space, and both are remap targets.
      //
      case n if (n >= 0x2400 && n <= 0x2423) => URC
      case _ => c.toInt
    }
    code.toChar
  }

  // windows-1252 is used here: same as iso-8859-1 but has a few more glyphs.
  private val bytesCharset = JavaCharset.forName("windows-1252")

  // Decoder for bytesCharset that substitutes a replacement for malformed
  // or unmappable input instead of reporting an error.
  private val bytesDecoder =
    bytesCharset.newDecoder()
      .onMalformedInput(CodingErrorAction.REPLACE)
      .onUnmappableCharacter(CodingErrorAction.REPLACE)

  /**
   * Used when creating a debugging dump of data, where the data might be binary stuff
   * but we want to show some sort of glyph for each byte.
   *
   * This uses windows-1252 for all the places it has glyphs, and other unicode
   * glyph characters to replace the control characters and unused characters.
   *
   * This allows printing a data dump to the screen, without worry that the control
   * characters will ring bells or cause the text to jump around, and unmapped
   * characters will not look like spaces, nor all look like the same unicode replacement
   * character.
   *
   * @param bb source bytes; decoding reads from its current position
   * @param cb destination for the glyph characters; it is flipped after
   *           decoding, so on return it is ready to be read
   */
  def remapBytesToVisibleGlyphs(bb: ByteBuffer, cb: CharBuffer): Unit = {
    // Capture the byte count first: decoding advances bb's position.
    val numBytes = bb.remaining()
    // First pass: decode everything with the shared windows-1252 decoder.
    bytesDecoder.decode(bb, cb, true)
    // Switch cb from writing to reading; its limit becomes the end of the decoded text.
    cb.flip
    // Second pass: overwrite, in place, the characters for bytes that need
    // a special picture glyph.
    // NOTE(review): bb.get(i) and cb.put(i, ...) use absolute indexes
    // starting at 0, so this appears to assume both buffers were at
    // position 0 on entry - confirm against callers.
    var i = 0
    while (i < numBytes) {
      val newCodepoint = remapOneByteToVisibleGlyph(bb.get(i))
      // -1 means the decoded character is kept as is.
      if (newCodepoint != -1) {
        cb.put(i, newCodepoint.toChar)
      }
      i += 1
    }
  }

  /**
   * For unicode codepoints in the range 0 to 255, or signed -128 to 127,
   * make sure there is a visible glyph.
   *
   * Codepoints outside -128..255 are returned unchanged. Inside that range
   * the value is narrowed to a byte and remapped; when the byte needs no
   * remapping the original codepoint is returned.
   */
  def remapCodepointToVisibleGlyph(codepoint: Int): Int =
    if (codepoint > 255 || codepoint < -128) codepoint
    else {
      remapOneByteToVisibleGlyph(codepoint.toByte) match {
        case -1 => codepoint
        case glyph => glyph
      }
    }

  /**
   * Applies remapCodepointToVisibleGlyph to every character of a string.
   */
  def remapStringToVisibleGlyphs(s: String) =
    for (ch <- s) yield remapCodepointToVisibleGlyph(ch.toInt).toChar

  /**
   * Builds a string with one character per byte, each obtained by passing
   * the byte's value through remapCodepointToVisibleGlyph.
   */
  def remapBytesToStringOfVisibleGlyphs(ba: Array[Byte]): String = {
    val glyphs: Array[Char] = ba.map { b => remapCodepointToVisibleGlyph(b.toInt).toChar }
    new String(glyphs)
  }

  /**
   * Remaps a single byte by running it through remapBytesToVisibleGlyphs
   * with one-element buffers, and returns the resulting character's code.
   */
  def remapByteToVisibleGlyph(b: Byte): Int = {
    val oneByte = ByteBuffer.wrap(Array(b))
    val oneChar = CharBuffer.allocate(1)
    remapBytesToVisibleGlyphs(oneByte, oneChar)
    oneChar.get(0).toInt
  }

  /**
   * Remaps a byte to a unicode codepoint for a visible picture, or -1 if
   * no remapping is needed.
   *
   * Note from the original author: generally available Unicode fonts do
   * not seem to be truly monospaced for every character, so the tabular
   * layout of a data dump can be off a bit. Courier New seems to work well
   * for every character used in this remapping, but not for "double wide"
   * characters such as Kanji.
   */
  private def remapOneByteToVisibleGlyph(b: Byte): Int = {
    Bits.asUnsignedByte(b) match {
      // C0 controls become the unicode control pictures.
      case c0 if (c0 <= 0x1F) => c0 + 0x2400
      // Space and DEL get control pictures too.
      case 0x20 => 0x2423 // space becomes ␣ (Unicode OPEN BOX)
      case 0x7F => 0x2421 // DEL pic isn't at 0x247F, it's at 0x2421
      //
      // These are the only codepoints in the C1 control space which do not
      // have a glyph defined by windows-1252. They are moved into the
      // Unicode Latin Extended B codepoints by adding 0x100.
      //
      case undefinedC1 @ (0x81 | 0x8D | 0x8F | 0x90 | 0x9D) => undefinedC1 + 0x100
      case 0xA0 => 0x2422 // non-break space becomes little b with stroke
      case 0xAD => 0x002D // soft hyphen becomes hyphen
      // Anything else keeps its regular character glyph.
      case _ => -1
    }
  }

  /**
   * True if this charset encoding is suitable for display using the
   * all-visible-glyph stuff above.
   *
   * @param csName charset name, resolved with Charset.forName
   */
  def isAsciiBased(csName: String): Boolean = isAsciiBased(JavaCharset.forName(csName))

  /**
   * True if the charset looks ASCII-based.
   *
   * The charset's aliases are checked first: one that starts with "ASCII",
   * "US-ASCII", "ISO-8859" or "UTF" and mentions neither "7-BIT" nor
   * "EBCDIC" is enough. Otherwise the ASCII bytes of "abc" are decoded
   * with the charset and the result must be "abc" again.
   */
  def isAsciiBased(cs: JavaCharset): Boolean = {
    val asciiFamilyPrefixes = Seq("ASCII", "US-ASCII", "ISO-8859", "UTF")
    val excludedMarkers = Seq("7-BIT", "EBCDIC")
    val upperAliases: Seq[String] = cs.aliases().asScala.toSeq.map { _.toUpperCase }
    val byName = upperAliases.exists { alias =>
      !excludedMarkers.exists(marker => alias.contains(marker)) &&
        asciiFamilyPrefixes.exists(prefix => alias.startsWith(prefix))
    }
    byName || {
      // The names were inconclusive, so probe the charset's behavior.
      val decoder = cs.newDecoder()
      decoder.onMalformedInput(CodingErrorAction.REPLACE)
      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE)
      val decoded = decoder.decode(ByteBuffer.wrap("abc".getBytes("ascii"))).toString()
      decoded =:= "abc"
    }
  }

  import scala.language.reflectiveCalls // scala 2.10 creates warning unless we have this.
  /**
   * Convenient I/O tool: applies `f` to a resource and closes the resource
   * afterwards, whether `f` returns normally or throws.
   *
   * @param param any object that has a `close(): Unit` method
   * @param f     the work to do with the resource
   * @return whatever `f` returns
   */
  def using[A <: { def close(): Unit }, B](param: A)(f: A => B): B =
    try f(param)
    finally param.close()

  /**
   * Convenience method: opens a java.io.FileWriter on the named file,
   * hands it to `body`, and closes it when `body` is done.
   *
   * @param fileName name of the file to write
   * @param body     code that writes to the supplied Writer
   */
  def writeToFile(fileName: String)(body: java.io.Writer => Unit): Unit =
    using(new java.io.FileWriter(fileName)) { writer => body(writer) }

  /**
   * Detects the encoding of the content at a URI for us.
   *
   * This was needed for the ConstructingParser
   * in order to read in xml from a file.
   *
   * The stream opened on the URI is closed even when the detection
   * throws, so a failed detection does not leak the stream.
   *
   * @param uri location of the data to examine
   * @return the encoding name reported by scala-xml's EncodingHeuristics
   */
  def determineEncoding(uri: URI): String = {
    val encH = scala.xml.include.sax.EncodingHeuristics
    val is = uri.toURL.openStream()
    try {
      val bis = new java.io.BufferedInputStream(is)
      encH.readEncodingFromStream(bis)
    } finally {
      is.close()
    }
  }

  /**
   * Detects the encoding of a file; see determineEncoding(uri).
   */
  def determineEncoding(file: File): String = determineEncoding(file.toURI)

  /**
   * CharSequence to String.
   *
   * A String argument is returned as is; anything else is copied through
   * a StringBuilder sized to the sequence's length.
   */
  def csToString(cs: CharSequence): String = cs match {
    case alreadyString: String => alreadyString
    case other => new StringBuilder(other.length()).append(other).toString
  }

  /**
   * Gets a diagnostic message out of any throwable, uniformly.
   *
   * Java throwables may or may not have a message, and may or may not have
   * a cause; dealing with all the combinations at each place a throwable
   * is turned into diagnostic text is too painful. So this never returns
   * null and always produces a message:
   *
   *  - no cause, and a null or empty message: the throwable's class name
   *  - no cause, and a message: that message
   *  - a cause but a null message: the message obtained from the cause
   *  - a cause and a message: the cause's message followed by
   *    " (within " + message + ")"
   *
   * Causes are chased recursively.
   */
  def getSomeMessage(th: Throwable): Some[String] = {
    val msg = th.getMessage()
    val cause = th.getCause()
    val text =
      if (cause == null) {
        if (msg == null || msg == "") Misc.getNameFromClass(th)
        else msg
      } else {
        val inner = getSomeMessage(cause).get
        if (msg == null) inner
        else inner + " (within " + msg + ")"
      }
    Some(text)
  }

  /**
   * Follows the chain of causes to its end.
   *
   * @return the innermost cause, or `th` itself when it has no cause
   */
  def getSomeCause(th: Throwable): Some[Throwable] = {
    var innermost = th
    while (innermost.getCause() != null) {
      innermost = innermost.getCause()
    }
    Some(innermost)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy