All downloads are free. Search and download functionality uses the official Maven repository.

io.github.seabow.datax.common.HdfsUtils.scala Maven / Gradle / Ivy

package io.github.seabow.datax.common

import org.apache.hadoop.fs.{ContentSummary, FileStatus, FileSystem, Path}

import java.io.{BufferedInputStream, BufferedOutputStream, InputStream}
import java.util.concurrent.atomic.AtomicLong
import scala.collection.mutable.ListBuffer
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Future}
import scala.util.Try

/**
 * Convenience helpers around the Hadoop `FileSystem` API.
 *
 * Every helper takes its target location as a string, resolves the owning
 * `FileSystem` from the path's URI through [[HdfsUtils.hdfs]], and delegates
 * to the corresponding Hadoop call.
 */
object HdfsUtils {

  /** Adds `toPath` to `String`, so `"/a/b".toPath` builds a Hadoop `Path`. */
  implicit class StringImprovements(val s: String) {
    def toPath: Path = new Path(s)
  }

  /**
   * Resolves the `FileSystem` responsible for `path`.
   *
   * @param path location whose URI selects the file system
   * @return the file system obtained from `FileSystem.get` with the
   *         configuration supplied by `SparkUtils.getHadoopConf()`
   */
  def hdfs(path: Path): FileSystem = FileSystem.get(path.toUri, SparkUtils.getHadoopConf())

  /**
   * Reads a whole file and decodes it as UTF-8 text.
   *
   * @param p file location
   * @return the decoded content, or an empty string when `p` does not exist
   * @throws java.io.IOException if the file is larger than `Int.MaxValue` bytes
   *                             or the read fails
   */
  def readAsString(p: String): String =
    // Decode with an explicit charset: widening each byte with `toChar`
    // sign-extends values >= 0x80 and garbles every non-ASCII character.
    new String(readAsByte(p), java.nio.charset.StandardCharsets.UTF_8)

  /**
   * Reads a whole file into memory.
   *
   * @param p file location
   * @return the file content, or an empty array when `p` does not exist
   * @throws java.io.IOException if the file is larger than `Int.MaxValue` bytes
   *                             or the read fails
   */
  def readAsByte(p: String): Array[Byte] = {
    val path = new Path(p)
    val fs = hdfs(path)
    if (!fs.exists(path)) {
      Array.emptyByteArray
    } else {
      val length = fs.getFileStatus(path).getLen
      // A plain `toInt` would wrap around for files over 2 GiB and either
      // fail with a negative array size or silently truncate the read.
      if (length > Int.MaxValue) {
        throw new java.io.IOException(s"File too large to read into memory ($length bytes): $p")
      }
      val buffer = new Array[Byte](length.toInt)
      val in = fs.open(path)
      try in.readFully(buffer)
      finally in.close()
      buffer
    }
  }

  /**
   * Writes `bytes` to `p` unless something already exists at that location.
   *
   * @param p     target location
   * @param bytes content to write
   * @return always `true`, including when `p` already existed and nothing was
   *         written (kept for compatibility with existing callers)
   */
  def writeBytes(p: String, bytes: Array[Byte]): Boolean = {
    val path = new Path(p)
    val fs = hdfs(path)
    if (!fs.exists(path)) {
      val out = fs.create(path)
      try out.write(bytes, 0, bytes.length)
      finally out.close()
    }
    true
  }

  /**
   * Opens `p` for reading.
   *
   * @param p file location
   * @return an open stream that the caller must close, or `null` when `p`
   *         does not exist
   */
  def readAsInputStream(p: String): InputStream = {
    val path = new Path(p)
    val fs = hdfs(path)
    if (fs.exists(path)) fs.open(path) else null
  }

  /**
   * Returns the last entry of the listing of `path`.
   *
   * Entries are not filtered by type, so the result may be a file. Which entry
   * counts as "latest" depends on the order in which the file system returns
   * its listing.
   *
   * @param path directory to list
   * @return the string form of the last listed entry, or `path` itself when
   *         the listing is empty or fails for any reason
   */
  def getLatestSubdir(path: String): String =
    Try {
      val dir = new Path(path)
      hdfs(dir).listStatus(dir).last.getPath.toString
    }.getOrElse(path)

  /** Lists the direct children of `p` that are directories. */
  def listDirs(p: String): Array[FileStatus] = listStatus(p).filter(_.isDirectory)

  /** Lists the direct children of `p` that are files. */
  def listFiles(p: String): Array[FileStatus] = listStatus(p).filter(_.isFile)

  /** Lists the statuses returned by `FileSystem.listStatus` for `p`. */
  def listStatus(p: String): Array[FileStatus] = {
    val path = p.toPath
    hdfs(path).listStatus(path)
  }

  /** Returns whether `p` exists. */
  def exist(p: String): Boolean = {
    val path = p.toPath
    hdfs(path).exists(path)
  }

  /** Creates the directory `path` through `FileSystem.mkdirs` and returns its result. */
  def mkdir(path: String): Boolean = {
    val target = path.toPath
    hdfs(target).mkdirs(target)
  }

  /** Deletes `path` recursively and returns the result of `FileSystem.delete`. */
  def delete(path: String): Boolean = {
    val target = path.toPath
    hdfs(target).delete(target, true)
  }

  /** Alias of [[HdfsUtils.listStatus]]. */
  def status(path: String): Array[FileStatus] = listStatus(path)

  /** Returns the content summary computed by the file system for `path`. */
  def getContentSummary(path: String): ContentSummary = {
    val target = path.toPath
    hdfs(target).getContentSummary(target)
  }

  /** Returns the `FileStatus` of `path`. */
  def getStatus(path: String): FileStatus = {
    val target = path.toPath
    hdfs(target).getFileStatus(target)
  }

  /**
   * Creates an empty file at `path` if nothing exists there yet.
   *
   * @param path location of the file to create
   * @return `true` only when this call created the file; `false` when the path
   *         already existed or the file system reported that creation failed
   */
  def touch(path: String): Boolean = {
    val target = path.toPath
    val fs = hdfs(target)
    if (fs.exists(target)) false else fs.createNewFile(target)
  }

  /**
   * Copies the file at `srcPath` to `destPath`.
   *
   * The destination is opened with `FileSystem.create(Path)`.
   *
   * @param srcPath  file to read
   * @param destPath file to write
   * @return `true` once the copy has completed
   * @throws java.io.FileNotFoundException if `srcPath` does not exist
   * @throws java.io.IOException           if reading or writing fails
   */
  def copy(srcPath: String, destPath: String): Boolean = {
    val source = readAsInputStream(srcPath)
    // Fail before touching the destination so that a missing source does not
    // leave an empty destination file behind.
    if (source == null) {
      throw new java.io.FileNotFoundException(s"Source path does not exist: $srcPath")
    }
    val in = new BufferedInputStream(source)
    try {
      val dest = destPath.toPath
      val out = new BufferedOutputStream(hdfs(dest).create(dest))
      try {
        val buffer = new Array[Byte](4096)
        var len = in.read(buffer)
        while (len != -1) {
          out.write(buffer, 0, len)
          len = in.read(buffer)
        }
      } finally {
        out.close()
      }
    } finally {
      in.close()
    }
    true
  }

  /** Builds the summary of a single file: one file, no directory, space consumed equal to its length. */
  private def fileSummary(length: Long): ContentSummary =
    (new ContentSummary.Builder).length(length).fileCount(1L).directoryCount(0L).spaceConsumed(length).build()

  /**
   * Computes the content summary of `path`, summarising its direct children
   * concurrently.
   *
   * Each direct child is handled by its own task: directories are summarised
   * with [[HdfsUtils.getContentSummary]], files contribute their own length.
   * The reported space consumed is the total length; the children's own
   * space-consumed values are not accumulated.
   *
   * @param numThreads value passed to `FutureUtils.buildExecutorContext`
   * @param path       file or directory to summarise
   * @return the aggregated summary; blocks until every child task has finished
   */
  def getContentSummaryWithThreads(numThreads: Int)(path: String): ContentSummary = {
    val rootStatus = this.getStatus(path)
    if (rootStatus.isFile) {
      fileSummary(rootStatus.getLen)
    } else {
      // Slots: 0 = total length, 1 = file count, 2 = directory count
      // (starts at 1 to account for `path` itself).
      val summary = Array(new AtomicLong(0L), new AtomicLong(0L), new AtomicLong(1L))
      val children = this.listStatus(path)
      // NOTE(review): this executor is never shut down here; whether that leaks
      // threads depends on FutureUtils.buildExecutorContext - TODO confirm.
      val ec = FutureUtils.buildExecutorContext(numThreads)
      val tasks = children.toList.map { child =>
        Future {
          val c =
            if (child.isDirectory) this.getContentSummary(child.getPath.toString)
            else fileSummary(child.getLen)
          summary(0).addAndGet(c.getLength)
          summary(1).addAndGet(c.getFileCount)
          summary(2).addAndGet(c.getDirectoryCount)
        }(ec)
      }
      Await.result(Future.sequence(tasks), Duration.Inf)
      val totalLength = summary(0).get()
      (new ContentSummary.Builder)
        .length(totalLength)
        .fileCount(summary(1).get())
        .directoryCount(summary(2).get())
        .spaceConsumed(totalLength)
        .build()
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy