/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.util.io

import com.amazonaws.AmazonServiceException
import com.johnsnowlabs.client.CloudResources
import com.johnsnowlabs.client.aws.AWSGateway
import com.johnsnowlabs.client.util.CloudHelper
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, TaggedWord}
import com.johnsnowlabs.nlp.util.io.ReadAs._
import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
import com.johnsnowlabs.util.ConfigHelper
import org.apache.commons.io.FileUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

import java.io._
import java.net.{URI, URL, URLDecoder}
import java.nio.file
import java.nio.file.{Files, Paths}
import java.util.jar.JarFile
import scala.collection.mutable.{ArrayBuffer, Map => MMap}
import scala.io.BufferedSource
import scala.util.{Failure, Success, Try}

/** Central helper for IO management. Streams, sources and external inputs should be handled
  * from here.
  */
object ResourceHelper {

  def getActiveSparkSession: SparkSession =
    SparkSession.getActiveSession.getOrElse(
      SparkSession
        .builder()
        .appName("SparkNLP Default Session")
        .master("local[*]")
        .config("spark.driver.memory", "22G")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000m")
        .getOrCreate())

  def getSparkSessionWithS3(
      awsAccessKeyId: String,
      awsSecretAccessKey: String,
      hadoopAwsVersion: String = ConfigHelper.hadoopAwsVersion,
      AwsJavaSdkVersion: String = ConfigHelper.awsJavaSdkVersion,
      region: String = "us-east-1",
      s3Impl: String = "org.apache.hadoop.fs.s3a.S3AFileSystem",
      pathStyleAccess: Boolean = true,
      credentialsProvider: String = "TemporaryAWSCredentialsProvider",
      awsSessionToken: Option[String] = None): SparkSession = {

    require(
      SparkSession.getActiveSession.isEmpty,
      "Spark session already running, can't apply new configuration for S3.")

    val sparkSession = SparkSession
      .builder()
      .appName("SparkNLP Session with S3 Support")
      .master("local[*]")
      .config("spark.driver.memory", "22G")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.kryoserializer.buffer.max", "1000M")
      .config("spark.driver.maxResultSize", "0")
      .config("spark.hadoop.fs.s3a.access.key", awsAccessKeyId)
      .config("spark.hadoop.fs.s3a.secret.key", awsSecretAccessKey)
      .config(ConfigHelper.awsExternalRegion, region)
      .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        s"org.apache.hadoop.fs.s3a.$credentialsProvider")
      .config("spark.hadoop.fs.s3a.impl", s3Impl)
      .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:" + hadoopAwsVersion + ",com.amazonaws:aws-java-sdk:" + AwsJavaSdkVersion)
      .config("spark.hadoop.fs.s3a.path.style.access", pathStyleAccess.toString)

    if (credentialsProvider == "TemporaryAWSCredentialsProvider") {
      require(
        awsSessionToken.isDefined,
        "AWS Session token needs to be provided for TemporaryAWSCredentialsProvider.")
      sparkSession.config("spark.hadoop.fs.s3a.session.token", awsSessionToken.get)
    }

    sparkSession.getOrCreate()
  }
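
  // Example (illustrative sketch): starting an S3-enabled session with temporary credentials
  // read from hypothetical environment variables. Only works while no SparkSession is active,
  // as enforced by the require above.
  //
  //   val session = ResourceHelper.getSparkSessionWithS3(
  //     awsAccessKeyId = sys.env("AWS_ACCESS_KEY_ID"),
  //     awsSecretAccessKey = sys.env("AWS_SECRET_ACCESS_KEY"),
  //     region = "eu-west-1",
  //     awsSessionToken = sys.env.get("AWS_SESSION_TOKEN"))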

  lazy val spark: SparkSession = getActiveSparkSession

  /** Structure for a SourceStream backed by a local, HDFS, DBFS or S3 resource path */
  case class SourceStream(resource: String) {

    var fileSystem: Option[FileSystem] = None
    private val (pathExists: Boolean, path: Option[Path]) = OutputHelper.doesPathExists(resource)
    if (!pathExists) {
      throw new FileNotFoundException(s"file or folder: $resource not found")
    } else {
      fileSystem = Some(OutputHelper.getFileSystem(resource))
    }

    val pipe: Seq[InputStream] = getPipe(fileSystem.get)
    private val openBuffers: Seq[BufferedSource] = pipe.map(pp => {
      new BufferedSource(pp)("UTF-8")
    })
    val content: Seq[Iterator[String]] = openBuffers.map(c => c.getLines())

    private def getPipe(fileSystem: FileSystem): Seq[InputStream] = {
      if (fileSystem.getScheme == "s3a") {
        val awsGateway = new AWSGateway()
        val (bucket, s3Path) = CloudHelper.parseS3URI(path.get.toString)
        val inputStreams = awsGateway.listS3Files(bucket, s3Path).map { summary =>
          val s3Object = awsGateway.getS3Object(bucket, summary.getKey)
          s3Object.getObjectContent
        }
        inputStreams
      } else {
        val files = fileSystem.listFiles(path.get, true)
        val buffer = ArrayBuffer.empty[InputStream]
        while (files.hasNext) buffer.append(fileSystem.open(files.next().getPath))
        buffer
      }
    }

    /** Copies the resource into a local temporary folder and returns the folder's URI.
      *
      * @param prefix
      *   Prefix for the temporary folder.
      * @return
      *   URI of the created temporary folder with the resource
      */
    def copyToLocal(prefix: String = "sparknlp_tmp_"): URI = {
      if (fileSystem.get.getScheme == "file")
        return URI.create(resource)

      val destination: file.Path = Files.createTempDirectory(prefix)

      val destinationUri = fileSystem.get.getScheme match {
        case "hdfs" =>
          fileSystem.get.copyToLocalFile(false, path.get, new Path(destination.toUri), true)
          if (fileSystem.get.getFileStatus(path.get).isDirectory)
            Paths.get(destination.toString, path.get.getName).toUri
          else destination.toUri
        case "dbfs" =>
          val dbfsPath = path.get.toString.replace("dbfs:/", "/dbfs/")
          val sourceFile = new File(dbfsPath)
          val targetFile = new File(destination.toString)
          if (sourceFile.isFile) FileUtils.copyFileToDirectory(sourceFile, targetFile)
          else FileUtils.copyDirectory(sourceFile, targetFile)
          targetFile.toURI
        case _ =>
          val files = fileSystem.get.listFiles(path.get, false)
          while (files.hasNext) {
            fileSystem.get.copyFromLocalFile(files.next.getPath, new Path(destination.toUri))
          }
          destination.toUri
      }

      destinationUri
    }

    def close(): Unit = {
      openBuffers.foreach(_.close())
      pipe.foreach(_.close())
    }
  }
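
  // Example (illustrative sketch; the HDFS path below is hypothetical): reading all lines of a
  // resource and releasing the underlying streams afterwards.
  //
  //   val stream = ResourceHelper.SourceStream("hdfs:///data/dictionary.txt")
  //   try stream.content.foreach(_.foreach(println))
  //   finally stream.close()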

  private def fixTarget(path: String): String = {
    val toSearch =
      s"^.*target\\${File.separator}.*scala-.*\\${File.separator}.*classes\\${File.separator}"
    if (path.matches(toSearch + ".*")) {
      path.replaceFirst(toSearch, "")
    } else {
      path
    }
  }

  /** Copies the remote resource to a local temporary folder and returns its absolute path.
    *
    * Currently, file:/, s3:/, hdfs:/ and dbfs:/ are supported.
    *
    * If the file is already on the local file system just the absolute path will be returned
    * instead.
    * @param path
    *   Path to the resource
    * @return
    *   Absolute path to the temporary or local folder of the resource
    */
  def copyToLocal(path: String): String = try {
    val localUri =
      if (CloudHelper.isCloudPath(path)) { // Download directly from Cloud Buckets
        CloudResources.downloadBucketToLocalTmp(path)
      } else { // Use Source Stream
        val pathWithProtocol: String =
          if (URI.create(path).getScheme == null) new File(path).toURI.toURL.toString else path
        val resource = SourceStream(pathWithProtocol)
        resource.copyToLocal()
      }

    new File(localUri).getAbsolutePath // Platform independent path
  } catch {
    case awsE: AmazonServiceException =>
      println("Error while retrieving folder from S3. Make sure you have set the right " +
        "access keys with proper permissions in your configuration. For an example please see " +
        "https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/mfa_ner_graphs_s3.ipynb")
      throw awsE
    case e: Exception =>
      val copyToLocalErrorMessage: String =
        "Please make sure the provided path exists and is accessible while keeping in mind only file:/, hdfs:/, dbfs:/ and s3:/ protocols are supported at the moment."
      println(
        s"$e \n Therefore, could not create temporary local directory for provided path $path. $copyToLocalErrorMessage")
      throw e
  }
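
  // Example (illustrative sketch; bucket and key are hypothetical): downloading a cloud folder
  // to a local temporary directory and getting back its absolute path.
  //
  //   val localPath: String = ResourceHelper.copyToLocal("s3://my-bucket/models/pos_anc")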

  /** NOT thread safe. Do not call from executors. */
  def getResourceStream(path: String): InputStream = {
    if (new File(path).exists())
      new FileInputStream(new File(path))
    else {
      Option(getClass.getResourceAsStream(path))
        .getOrElse {
          Option(getClass.getClassLoader.getResourceAsStream(path))
            .getOrElse(throw new IllegalArgumentException(f"Wrong resource path $path"))
        }
    }
  }
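
  // Example (illustrative sketch; the resource name is hypothetical): the lookup falls back from
  // the local file system to the classpath.
  //
  //   val in: InputStream = ResourceHelper.getResourceStream("/my-resource.txt")
  //   val text = scala.io.Source.fromInputStream(in).mkString
  //   in.close()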

  def getResourceFile(path: String): URL = {
    var dirURL = getClass.getResource(path)

    if (dirURL == null)
      dirURL = getClass.getClassLoader.getResource(path)

    dirURL
  }

  def listResourceDirectory(path: String): Seq[String] = {
    val dirURL = getResourceFile(path)

    if (dirURL != null && dirURL.getProtocol.equals("file") && new File(dirURL.toURI).exists()) {
      /* A file path: easy enough */
      return new File(dirURL.toURI).listFiles.sorted.map(_.getPath).map(fixTarget)
    } else if (dirURL == null) {
      /* path not in resources and not in disk */
      throw new FileNotFoundException(path)
    }

    if (dirURL.getProtocol.equals("jar")) {
      /* A JAR path */
      val jarPath =
        dirURL.getPath.substring(5, dirURL.getPath.indexOf("!")) // strip out only the JAR file
      val jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8"))
      val entries = jar.entries()
      val result = new ArrayBuffer[String]()

      val pathToCheck = path
        .stripPrefix(File.separator.replaceAllLiterally("\\", "/"))
        .stripSuffix(File.separator) +
        File.separator.replaceAllLiterally("\\", "/")

      while (entries.hasMoreElements) {
        val name = entries.nextElement().getName.stripPrefix(File.separator)
        if (name.startsWith(pathToCheck)) { // filter according to the path
          var entry = name.substring(pathToCheck.length())
          val checkSubdir = entry.indexOf("/")
          if (checkSubdir >= 0) {
            // if it is a subdirectory, we just return the directory name
            entry = entry.substring(0, checkSubdir)
          }
          if (entry.nonEmpty) {
            result.append(pathToCheck + entry)
          }
        }
      }
      return result.distinct.sorted
    }

    throw new UnsupportedOperationException(s"Cannot list files for URL $dirURL")
  }
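
  // Example (illustrative sketch; the resource folder is hypothetical): works both when the
  // resources live on disk (e.g. target/classes) and when they are packaged inside a jar.
  //
  //   val entries: Seq[String] = ResourceHelper.listResourceDirectory("/my-resource-folder")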

  /** General purpose key-value parser from a source. Supports the TEXT and SPARK readAs modes.
    *
    * @return
    */
  def parseKeyValueText(er: ExternalResource): Map[String, String] = {
    er.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(er.path)
        val res = sourceStream.content
          .flatMap(c =>
            c.map(line => {
              val kv = line.split(er.options("delimiter"))
              (kv.head.trim, kv.last.trim)
            }))
          .toMap
        sourceStream.close()
        res
      case SPARK =>
        import spark.implicits._
        val dataset = spark.read
          .options(er.options)
          .format(er.options("format"))
          .options(er.options)
          .option("delimiter", er.options("delimiter"))
          .load(er.path)
          .toDF("key", "value")
        val keyValueStore = MMap.empty[String, String]
        dataset.as[(String, String)].foreach { kv =>
          keyValueStore(kv._1) = kv._2
        }
        keyValueStore.toMap
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }
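
  // Example (illustrative sketch; the path is hypothetical and assumes tab-separated
  // "key<TAB>value" lines):
  //
  //   val er = ExternalResource(
  //     "key_value.txt",
  //     ReadAs.TEXT,
  //     Map("format" -> "text", "delimiter" -> "\t"))
  //   val lookup: Map[String, String] = ResourceHelper.parseKeyValueText(er)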

  def parseKeyListValues(externalResource: ExternalResource): Map[String, List[String]] = {
    externalResource.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(externalResource.path)
        val keyValueStore = MMap.empty[String, List[String]]
        sourceStream.content.foreach(content =>
          content.foreach { line =>
            {
              val keyValues = line.split(externalResource.options("delimiter"))
              val key = keyValues.head
              val value = keyValues.drop(1).toList
              val storedValue = keyValueStore.get(key)
              if (storedValue.isDefined && !storedValue.contains(value)) {
                keyValueStore.update(key, storedValue.get ++ value)
              } else keyValueStore(key) = value
            }
          })
        sourceStream.close()
        keyValueStore.toMap
    }
  }

  def parseKeyArrayValues(externalResource: ExternalResource): Map[String, Array[Float]] = {
    externalResource.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(externalResource.path)
        val keyValueStore = MMap.empty[String, Array[Float]]
        sourceStream.content.foreach(content =>
          content.foreach { line =>
            {
              val keyValues = line.split(externalResource.options("delimiter"))
              val key = keyValues.head
              val value = keyValues.drop(1).map(x => x.toFloat)
              if (value.length > 1) {
                keyValueStore(key) = value
              }
            }
          })
        sourceStream.close()
        keyValueStore.toMap
    }
  }
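
  // Example (illustrative sketch; the path is hypothetical and assumes embeddings-style lines
  // such as "word 0.1 0.2 0.3", space-delimited):
  //
  //   val er = ExternalResource(
  //     "embeddings.txt",
  //     ReadAs.TEXT,
  //     Map("format" -> "text", "delimiter" -> " "))
  //   val vectors: Map[String, Array[Float]] = ResourceHelper.parseKeyArrayValues(er)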

  /** General purpose line parser from a source. Supports the TEXT and SPARK readAs modes.
    *
    * @return
    */
  def parseLines(er: ExternalResource): Array[String] = {
    er.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(er.path)
        val res = sourceStream.content.flatten.toArray
        sourceStream.close()
        res
      case SPARK =>
        import spark.implicits._
        spark.read
          .options(er.options)
          .format(er.options("format"))
          .load(er.path)
          .as[String]
          .collect
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }
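
  // Example (illustrative sketch; the stop-words file is hypothetical):
  //
  //   val er = ExternalResource("stopwords.txt", ReadAs.TEXT, Map("format" -> "text"))
  //   val lines: Array[String] = ResourceHelper.parseLines(er)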

  /** General purpose line parser from a source, returning an iterator per file. Currently reads
    * only text files.
    *
    * @return
    */
  def parseLinesIterator(er: ExternalResource): Seq[Iterator[String]] = {
    er.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(er.path)
        sourceStream.content
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }

  /** General purpose tuple parser from a source. Supports the TEXT and SPARK readAs modes.
    *
    * @return
    */
  def parseTupleText(er: ExternalResource): Array[(String, String)] = {
    er.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(er.path)
        val res = sourceStream.content
          .flatMap(c =>
            c.filter(_.nonEmpty)
              .map(line => {
                val kv = line.split(er.options("delimiter")).map(_.trim)
                (kv.head, kv.last)
              }))
          .toArray
        sourceStream.close()
        res
      case SPARK =>
        import spark.implicits._
        val dataset = spark.read.options(er.options).format(er.options("format")).load(er.path)
        val lineStore = spark.sparkContext.collectionAccumulator[String]
        dataset.as[String].foreach(l => lineStore.add(l))
        val result = lineStore.value.toArray.map(line => {
          val kv = line.toString.split(er.options("delimiter")).map(_.trim)
          (kv.head, kv.last)
        })
        lineStore.reset()
        result
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }
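
  // Example (illustrative sketch; the path is hypothetical and assumes comma-separated
  // "key,value" lines):
  //
  //   val er = ExternalResource(
  //     "pairs.csv",
  //     ReadAs.TEXT,
  //     Map("format" -> "text", "delimiter" -> ","))
  //   val pairs: Array[(String, String)] = ResourceHelper.parseTupleText(er)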

  /** General purpose tagged-word tuple parser from a source. Supports the TEXT and SPARK readAs
    * modes.
    *
    * @return
    */
  def parseTupleSentences(er: ExternalResource): Array[TaggedSentence] = {
    er.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(er.path)
        val result = sourceStream.content
          .flatMap(c =>
            c.filter(_.nonEmpty)
              .map(line => {
                line
                  .split("\\s+")
                  .filter(kv => {
                    val s = kv.split(er.options("delimiter").head)
                    s.length == 2 && s(0).nonEmpty && s(1).nonEmpty
                  })
                  .map(kv => {
                    val p = kv.split(er.options("delimiter").head)
                    TaggedWord(p(0), p(1))
                  })
              }))
          .toArray
        sourceStream.close()
        result.map(TaggedSentence(_))
      case SPARK =>
        import spark.implicits._
        val dataset = spark.read.options(er.options).format(er.options("format")).load(er.path)
        val result = dataset
          .as[String]
          .filter(_.nonEmpty)
          .map(line => {
            line
              .split("\\s+")
              .filter(kv => {
                val s = kv.split(er.options("delimiter").head)
                s.length == 2 && s(0).nonEmpty && s(1).nonEmpty
              })
              .map(kv => {
                val p = kv.split(er.options("delimiter").head)
                TaggedWord(p(0), p(1))
              })
          })
          .collect
        result.map(TaggedSentence(_))
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }
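
  // Example (illustrative sketch; the corpus path is hypothetical and assumes POS-style lines
  // such as "The_DT cat_NN sat_VBD", where only the first character of "delimiter" is used):
  //
  //   val er = ExternalResource(
  //     "pos_corpus.txt",
  //     ReadAs.TEXT,
  //     Map("format" -> "text", "delimiter" -> "_"))
  //   val sentences: Array[TaggedSentence] = ResourceHelper.parseTupleSentences(er)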

  def parseTupleSentencesDS(er: ExternalResource): Dataset[TaggedSentence] = {
    er.readAs match {
      case SPARK =>
        import spark.implicits._
        val dataset = spark.read.options(er.options).format(er.options("format")).load(er.path)
        val result = dataset
          .as[String]
          .filter(_.nonEmpty)
          .map(line => {
            line
              .split("\\s+")
              .filter(kv => {
                val s = kv.split(er.options("delimiter").head)
                s.length == 2 && s(0).nonEmpty && s(1).nonEmpty
              })
              .map(kv => {
                val p = kv.split(er.options("delimiter").head)
                TaggedWord(p(0), p(1))
              })
          })
        result.map(TaggedSentence(_))
      case _ =>
        throw new Exception(
          "Unsupported readAs. If you're training POS with large dataset, consider PerceptronApproachDistributed")
    }
  }

  /** For keys with multiple values, this optimizer inverts the mapping so that each value points
    * to its key, giving constant-time access
    */
  def flattenRevertValuesAsKeys(er: ExternalResource): Map[String, String] = {
    er.readAs match {
      case TEXT =>
        val m: MMap[String, String] = MMap()
        val sourceStream = SourceStream(er.path)
        sourceStream.content.foreach(c =>
          c.foreach(line => {
            val kv = line.split(er.options("keyDelimiter")).map(_.trim)
            if (kv.length > 1) {
              val key = kv(0)
              val values = kv(1).split(er.options("valueDelimiter")).map(_.trim)
              values.foreach(m(_) = key)
            }
          }))
        sourceStream.close()
        m.toMap
      case SPARK =>
        import spark.implicits._
        val dataset = spark.read.options(er.options).format(er.options("format")).load(er.path)
        val valueAsKeys = MMap.empty[String, String]
        dataset
          .as[String]
          .foreach(line => {
            val kv = line.split(er.options("keyDelimiter")).map(_.trim)
            if (kv.length > 1) {
              val key = kv(0)
              val values = kv(1).split(er.options("valueDelimiter")).map(_.trim)
              values.foreach(v => valueAsKeys(v) = key)
            }
          })
        valueAsKeys.toMap
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }
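
  // Example (illustrative sketch; the dictionary path is hypothetical and assumes lemma-style
  // lines such as "be -> was were been"):
  //
  //   val er = ExternalResource(
  //     "lemmas.txt",
  //     ReadAs.TEXT,
  //     Map("format" -> "text", "keyDelimiter" -> "->", "valueDelimiter" -> " "))
  //   // Every surface form now points back to its key, e.g. "were" -> "be"
  //   val lemmaByForm: Map[String, String] = ResourceHelper.flattenRevertValuesAsKeys(er)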

  /** General purpose reader for a saved Spark DataFrame (e.g. Parquet). Only the SPARK readAs
    * mode is accepted; the format is taken from the resource options.
    *
    * @return
    */
  def readSparkDataFrame(er: ExternalResource): DataFrame = {
    er.readAs match {
      case SPARK =>
        val dataset = spark.read.options(er.options).format(er.options("format")).load(er.path)
        dataset
      case _ =>
        throw new Exception("Unsupported readAs - only accepts SPARK")
    }
  }
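
  // Example (illustrative sketch; the path is hypothetical):
  //
  //   val er = ExternalResource("data/corpus.parquet", ReadAs.SPARK, Map("format" -> "parquet"))
  //   val df: DataFrame = ResourceHelper.readSparkDataFrame(er)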

  def getWordCount(
      externalResource: ExternalResource,
      wordCount: MMap[String, Long] = MMap.empty[String, Long].withDefaultValue(0),
      pipeline: Option[PipelineModel] = None): MMap[String, Long] = {
    externalResource.readAs match {
      case TEXT =>
        val sourceStream = SourceStream(externalResource.path)
        val regex = externalResource.options("tokenPattern").r
        sourceStream.content.foreach(c =>
          c.foreach { line =>
            {
              val words: List[String] = regex.findAllMatchIn(line).map(_.matched).toList
              words.foreach(w =>
                // Builds a word-frequency map (word -> count) from the ExternalResource
                wordCount(w) += 1)
            }
          })
        sourceStream.close()
        if (wordCount.isEmpty)
          throw new FileNotFoundException(
            "Word count dictionary for spell checker does not exist or is empty")
        wordCount
      case SPARK =>
        import spark.implicits._
        val dataset = spark.read
          .options(externalResource.options)
          .format(externalResource.options("format"))
          .load(externalResource.path)
        val transformation = {
          if (pipeline.isDefined) {
            pipeline.get.transform(dataset)
          } else {
            val documentAssembler = new DocumentAssembler()
              .setInputCol("value")
            val tokenizer = new Tokenizer()
              .setInputCols("document")
              .setOutputCol("token")
              .setTargetPattern(externalResource.options("tokenPattern"))
            val finisher = new Finisher()
              .setInputCols("token")
              .setOutputCols("finished")
              .setAnnotationSplitSymbol("--")
            new Pipeline()
              .setStages(Array(documentAssembler, tokenizer, finisher))
              .fit(dataset)
              .transform(dataset)
          }
        }
        val wordCount = MMap.empty[String, Long].withDefaultValue(0)
        transformation
          .select("finished")
          .as[String]
          .foreach(text =>
            text
              .split("--")
              .foreach(t => {
                wordCount(t) += 1
              }))
        wordCount
      case _ => throw new IllegalArgumentException("format not available for word count")
    }
  }
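
  // Example (illustrative sketch; the corpus path is hypothetical): builds a word-frequency map
  // from a plain-text corpus using a whitespace-free token pattern.
  //
  //   val er = ExternalResource(
  //     "corpus.txt",
  //     ReadAs.TEXT,
  //     Map("format" -> "text", "tokenPattern" -> "\\S+"))
  //   val counts = ResourceHelper.getWordCount(er) // mutable Map with default value 0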

  def getFilesContentBuffer(externalResource: ExternalResource): Seq[Iterator[String]] = {
    externalResource.readAs match {
      case TEXT =>
        SourceStream(externalResource.path).content
      case _ =>
        throw new Exception("Unsupported readAs")
    }
  }

  def listLocalFiles(path: String): List[File] = {
    val fileSystem = OutputHelper.getFileSystem(path)

    val filesPath = fileSystem.getScheme match {
      case "hdfs" =>
        if (path.startsWith("file:")) {
          Option(new File(path.replace("file:", "")).listFiles())
        } else {
          try {
            val filesIterator = fileSystem.listFiles(new Path(path), false)
            val files: ArrayBuffer[File] = ArrayBuffer()

            while (filesIterator.hasNext) {
              val file = new File(filesIterator.next().getPath.toString)
              files.append(file)
            }

            Option(files.toArray)
          } catch {
            case _: FileNotFoundException =>
              Option(new File(path).listFiles())
          }

        }
      case "dbfs" if path.startsWith("dbfs:") =>
        Option(new File(path.replace("dbfs:", "/dbfs/")).listFiles())
      case _ => Option(new File(path).listFiles())
    }

    val files = filesPath.getOrElse(throw new FileNotFoundException(s"folder: $path not found"))
    files.toList
  }

  def getFileFromPath(pathToFile: String): File = {
    val fileSystem = OutputHelper.getFileSystem
    val filePath = fileSystem.getScheme match {
      case "hdfs" =>
        if (pathToFile.startsWith("file:")) {
          new File(pathToFile.replace("file:", ""))
        } else new File(pathToFile)
      case "dbfs" if pathToFile.startsWith("dbfs:") =>
        new File(pathToFile.replace("dbfs:", "/dbfs/"))
      case _ => new File(pathToFile)
    }

    filePath
  }

  def validFile(path: String): Boolean = {

    if (path.isEmpty) return false

    var isValid = validLocalFile(path) match {
      case Success(value) => value
      case Failure(_) => false
    }

    if (!isValid) {
      validHadoopFile(path) match {
        case Success(value) => isValid = value
        case Failure(_) => isValid = false
      }
    }

    if (!isValid) {
      validDbfsFile(path) match {
        case Success(value) => isValid = value
        case Failure(_) => isValid = false
      }
    }

    isValid
  }

  private def validLocalFile(path: String): Try[Boolean] = Try {
    Files.exists(Paths.get(path))
  }

  private def validHadoopFile(path: String): Try[Boolean] = Try {
    val hadoopPath = new Path(path)
    val fileSystem = OutputHelper.getFileSystem
    fileSystem.exists(hadoopPath)
  }

  private def validDbfsFile(path: String): Try[Boolean] = Try {
    getFileFromPath(path).exists()
  }

}



