io.shiftleft.utils.IOUtils.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of codepropertygraph_3 Show documentation
codepropertygraph
There is a newer version: 1.7.13
package io.shiftleft.utils

import java.io.Reader
import java.nio.charset.{CharsetDecoder, CodingErrorAction}
import java.nio.file.Path
import java.util.regex.Pattern
import scala.io.{BufferedSource, Codec, Source}
import scala.jdk.CollectionConverters.IteratorHasAsScala
import scala.util.Using

object IOUtils {

  private val surrogatePattern: Pattern = Pattern.compile("[^\u0000-\uffff]")

  private val boms: Set[Char] = Set(
    '\uefbb', // UTF-8
    '\ufeff', // UTF-16 (BE)
    '\ufffe'  // UTF-16 (LE)
  )

  /** Creates a new UTF-8 decoder. Sadly, instances of CharsetDecoder are not thread-safe as the doc states: 'Instances
    * of this class are not safe for use by multiple concurrent threads.' (copied from:
    * [[java.nio.charset.CharsetDecoder]])
    *
    * As we are using it in a [[io.shiftleft.passes.ForkJoinParallelCpgPass]] it needs to be thread-safe. Hence, we make
    * sure to create a new instance everytime.
    */
  private def createDecoder(): CharsetDecoder =
    Codec.UTF8.decoder
      .onMalformedInput(CodingErrorAction.REPLACE)
      .onUnmappableCharacter(CodingErrorAction.REPLACE)

  private def skipBOMIfPresent(reader: Reader): Unit = {
    reader.mark(1)
    val possibleBOM = new Array[Char](1)
    reader.read(possibleBOM)
    if (!boms.contains(possibleBOM(0))) {
      reader.reset()
    }
  }

  /** Java strings are stored as sequences of 16-bit chars, but what they represent is sequences of unicode characters.
    * In unicode terminology, they are stored as code units, but model code points. Thus, it's somewhat meaningless to
    * talk about removing surrogates, which don't exist in the character / code point representation (unless you have
    * rogue single surrogates, in which case you have other problems). Rather, what you want to do is to remove any
    * characters which will require surrogates when encoded. That means any character which lies beyond the basic
    * multilingual plane. You can do that with a simple regular expression.
    */
  private def replaceUnpairedSurrogates(input: String): String = {
    val matches = surrogatePattern.matcher(input)
    if (matches.find()) {
      val size = matches.end() - matches.start()
      matches.replaceAll("?" * size)
    } else {
      input
    }
  }

  private def contentFromBufferedSource(bufferedSource: BufferedSource): Seq[String] = {
    val reader = bufferedSource.bufferedReader()
    skipBOMIfPresent(reader)
    reader.lines().iterator().asScala.map(replaceUnpairedSurrogates).toSeq
  }

  private def contentStringFromBufferedSource(bufferedSource: BufferedSource): String = {
    val reader        = bufferedSource.bufferedReader()
    val stringBuilder = new StringBuilder
    val bufferSize    = 1024
    var productive    = true

    skipBOMIfPresent(reader)
    while (productive) {
      val buffer = new Array[Char](bufferSize)
      val read   = reader.read(buffer)
      productive = read > 0
      if (productive) {
        stringBuilder.appendAll(buffer, 0, read)
      }
    }

    replaceUnpairedSurrogates(stringBuilder.toString)
  }

  /** Reads a file at the given path and:
    *   - skips BOM if present
    *   - removes unpaired surrogates
    *   - uses UTF-8 encoding (replacing malformed and unmappable characters)
    *
    * @param path
    *   the file path
    * @return
    *   a Seq with all lines in the given file as Strings
    */
  def readLinesInFile(path: Path): Seq[String] =
    Using.resource(Source.fromFile(path.toFile)(createDecoder()))(contentFromBufferedSource)

  /** Reads a file at the given path and:
    *   - skips BOM if present
    *   - removes unpaired surrogates
    *   - uses UTF-8 encoding (replacing malformed and unmappable characters)
    *
    * @param path
    *   the file path
    * @return
    *   a String with the given file's contents
    */
  def readEntireFile(path: Path): String =
    Using.resource(Source.fromFile(path.toFile)(createDecoder()))(contentStringFromBufferedSource)

}