com.johnsnowlabs.storage.StorageHelper.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.storage

import com.johnsnowlabs.client.CloudResources
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkContext, SparkFiles}

import java.io.File

object StorageHelper {

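  /** Builds the folder name for a storage index as `<database>_<storageRef>`. */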
  def resolveStorageName(database: String, storageRef: String): String =
    new Path(database + "_" + storageRef).toString

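  /** Resolves the serialized index under `storageSourcePath`, stages it where the cluster
    * can read it, and returns a `RocksDBConnection` to the staged index.
    */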
  def load(
      storageSourcePath: String,
      spark: SparkSession,
      database: String,
      storageRef: String,
      withinStorage: Boolean): RocksDBConnection = {

    val dbFolder = StorageHelper.resolveStorageName(database, storageRef)
    val source = StorageLocator.getStorageSerializedPath(
      storageSourcePath.replaceAllLiterally("\\", "/"),
      dbFolder,
      withinStorage)

    val locator = StorageLocator(database, storageRef, spark)
    sendToCluster(
      source,
      locator.clusterFilePath,
      locator.clusterFileName,
      locator.destinationScheme,
      spark.sparkContext)

    RocksDBConnection.getOrCreate(locator.clusterFileName)
  }

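  /** Copies the local RocksDB index backing `connection` to `path`, optionally nested
    * under a `storage/` subfolder.
    */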
  def save(
      path: String,
      connection: RocksDBConnection,
      spark: SparkSession,
      withinStorage: Boolean): Unit = {
    val indexUri = "file://" + (new java.net.URI(
      connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath)
    val index = new Path(indexUri)

    val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
    val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
    val dst = new Path(path + {
      if (withinStorage) "/storage/" else ""
    })

    save(fs, index, dst)
  }

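  /** Creates `dst` if needed and copies the index into it, overwriting any previous copy
    * while leaving the local source files in place.
    */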
  private def save(fs: FileSystem, index: Path, dst: Path): Unit = {
    if (!fs.exists(dst))
      fs.mkdirs(dst)
    fs.copyFromLocalFile(false, true, index, dst)
  }

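  /** Makes the serialized index reachable by executors, either by staging it in a
    * temporary local folder or by copying it to `clusterFilePath` and registering it
    * with `SparkContext.addFile`, depending on the destination and source schemes.
    */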
  def sendToCluster(
      source: Path,
      clusterFilePath: Path,
      clusterFileName: String,
      destinationScheme: String,
      sparkContext: SparkContext): Unit = {
    destinationScheme match {
      case "file" => {
        val sourceFileSystemScheme = source.getFileSystem(sparkContext.hadoopConfiguration)
        val tmpIndexStorageLocalPath =
          RocksDBConnection.getTmpIndexStorageLocalPath(clusterFileName)
        sourceFileSystemScheme.getScheme match {
          case "file" => {
            if (!doesDirectoryExistJava(tmpIndexStorageLocalPath) ||
              !doesDirectoryExistHadoop(tmpIndexStorageLocalPath, sparkContext)) {
              copyIndexToLocal(source, new Path(tmpIndexStorageLocalPath), sparkContext)
            }
          }
          case "s3a" =>
            copyIndexToLocal(source, new Path(tmpIndexStorageLocalPath), sparkContext)
          case _ => copyIndexToCluster(source, clusterFilePath, sparkContext)
        }
      }
      case _ => {
        copyIndexToCluster(source, clusterFilePath, sparkContext)
      }
    }
  }

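  /** Returns true when `path` exists and is a directory, checked through java.io. */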
  private def doesDirectoryExistJava(path: String): Boolean = {
    val directory = new File(path)
    directory.exists && directory.isDirectory
  }

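  /** Returns true when `path` exists, checked through the Hadoop FileSystem API. */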
  private def doesDirectoryExistHadoop(path: String, sparkContext: SparkContext): Boolean = {
    val localPath = new Path(path)
    val fileSystem = localPath.getFileSystem(sparkContext.hadoopConfiguration)
    fileSystem.exists(localPath)
  }

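  /** Copies the index to `dst` and registers it with `SparkContext.addFile` so executors
    * can fetch it; skipped when the file has already been distributed. Returns the
    * destination path as a string.
    */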
  private def copyIndexToCluster(
      sourcePath: Path,
      dst: Path,
      sparkContext: SparkContext): String = {
    if (!new File(SparkFiles.get(dst.getName)).exists()) {
      val srcFS = sourcePath.getFileSystem(sparkContext.hadoopConfiguration)
      val dstFS = dst.getFileSystem(sparkContext.hadoopConfiguration)

      if (srcFS.getScheme == "file") {
        val src = sourcePath
        dstFS.copyFromLocalFile(false, true, src, dst)
      } else {
        FileUtil.copy(
          srcFS,
          sourcePath,
          dstFS,
          dst,
          false,
          true,
          sparkContext.hadoopConfiguration)
      }

      sparkContext.addFile(dst.toString, recursive = true)
    }
    dst.toString
  }

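  /** Copies the index from `source` into a local `destination`, downloading it from
    * cloud storage when the source is not on the local file system.
    */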
  private def copyIndexToLocal(
      source: Path,
      destination: Path,
      sparkContext: SparkContext): Unit = {

    /** A copy (rather than a move) is required here: moving the files breaks later
      * re-saves of this storage with CRC checksum errors.
      */
    val fileSystemDestination = destination.getFileSystem(sparkContext.hadoopConfiguration)
    val fileSystemSource = source.getFileSystem(sparkContext.hadoopConfiguration)

    if (fileSystemSource.getScheme == "file") {
      fileSystemDestination.copyFromLocalFile(false, true, source, destination)
    } else {
      CloudResources.downloadBucketToLocalTmp(
        source.toString,
        destination.toString,
        isIndex = true)
      val isLocalMode = sparkContext.master.startsWith("local")
      if (isLocalMode) {
        sparkContext.addFile(destination.toString, recursive = true)
      }
    }
  }

}
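
Below is a minimal usage sketch (not part of the original file) showing how `load` and `save` might be called. The paths, database name, and storage reference are illustrative assumptions, not values defined by the library.

// Minimal usage sketch, assuming a local Spark session and an existing serialized
// RocksDB index at /tmp/embeddings_index (both hypothetical).
import com.johnsnowlabs.storage.StorageHelper
import org.apache.spark.sql.SparkSession

object StorageHelperSketch extends App {
  val spark = SparkSession.builder()
    .appName("storage-helper-sketch")
    .master("local[*]")
    .getOrCreate()

  // Stage the index for the cluster and open a RocksDB connection to it.
  val connection = StorageHelper.load(
    storageSourcePath = "/tmp/embeddings_index", // hypothetical path
    spark = spark,
    database = "EMBEDDINGS", // illustrative database name
    storageRef = "glove_100d", // illustrative storage reference
    withinStorage = false)

  // Write the local index back out under <path>/storage/.
  StorageHelper.save("/tmp/exported_model", connection, spark, withinStorage = true)

  spark.stop()
}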