org.bom4v.ti.CopyMerge.scala
Delta Lake I/O (in Scala and Python)
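The artifact can be pulled with Maven, Gradle or Ivy (sbt shown below). The coordinates are an assumption inferred from the package name (org.bom4v.ti) and the artifact name (delta-lake-io_2.12); the page does not show a concrete version number, so the version is left as a placeholder:

// Assumed coordinates, inferred from the package and artifact names;
// with scalaVersion set to 2.12.x, %% resolves the suffix to _2.12
libraryDependencies += "org.bom4v.ti" %% "delta-lake-io" % "<version>"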
// Reference: https://stackoverflow.com/a/50545815/798053
package org.bom4v.ti
import scala.util.matching.Regex
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils
import org.apache.hadoop.io.compress.{BZip2Codec, CompressionCodecFactory}
import java.io.IOException
/**
 * Hadoop file-system utilities
 */
object UtilityForHadoop3 {
  /**
   * Re-implementation of the Hadoop FileUtil.copyMerge() method,
   * which has been removed in Hadoop 3. The content of every file
   * in srcDir is appended to dstFile, in the lexicographic order
   * of the file names
   */
  def copyMergeOldWay (srcFS: FileSystem, srcDir: Path,
                       dstFS: FileSystem, dstFile: Path,
                       deleteSource: Boolean, conf: Configuration): Boolean = {
    if (dstFS.exists (dstFile)) {
      throw new IOException (s"Target $dstFile already exists")
    }

    // Source path is expected to be a directory:
    if (srcFS.getFileStatus (srcDir).isDirectory()) {
      val outputFile = dstFS.create (dstFile)

      try {
        // Append the part files in the lexicographic order of their names.
        // Failures now propagate as IOException rather than being silently
        // swallowed, while the streams are still closed in all cases
        srcFS
          .listStatus (srcDir)
          .sortBy (_.getPath.getName)
          .foreach { status =>
            if (status.isFile()) {
              val inputFile = srcFS.open (status.getPath())
              try {
                IOUtils.copyBytes (inputFile, outputFile, conf, false)
              } finally {
                inputFile.close()
              }
            }
          }
      } finally {
        outputFile.close()
      }

      if (deleteSource) {
        srcFS.delete (srcDir, true)
      } else true
    } else false
  }
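  /*
   * Minimal usage sketch (the paths below are hypothetical,
   * for illustration only):
   *
   *   val conf = new Configuration()
   *   val fs = FileSystem.get (conf)
   *   val merged = UtilityForHadoop3.copyMergeOldWay (
   *     fs, new Path ("/tmp/csv-parts"),
   *     fs, new Path ("/tmp/merged.csv"),
   *     false, // deleteSource
   *     conf)
   */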
  /**
   * Re-implementation of the Hadoop FileUtil.copyMerge() method,
   * which has been removed in Hadoop 3. See also
   * https://www.oreilly.com/library/view/hadoop-the-definitive/9780596521974/ch04.html
   * This variant detects the compression codec of every input file
   * (if any) and decompresses it on the fly; the merged output file
   * is compressed with BZip2
   */
  def copyMerge (
    srcFS: FileSystem, srcDir: Path,
    dstFS: FileSystem, dstFile: Path,
    deleteSource: Boolean,
    hadoopConfig: Configuration
  ): Boolean = {
    if (dstFS.exists (dstFile)) {
      throw new IOException (s"Target $dstFile already exists")
    }

    // Source path is expected to be a directory:
    if (srcFS.getFileStatus (srcDir).isDirectory()) {
      val factory = new CompressionCodecFactory (hadoopConfig)

      // The merged output file is BZip2-compressed
      val bzCodec = new BZip2Codec()
      bzCodec.setConf (hadoopConfig)
      val outputFile = dstFS.create (dstFile)
      val outputStream = bzCodec.createOutputStream (outputFile)

      // Spark/Hadoop part files, either plain CSV or Deflate-compressed
      val csvFilePattern = new Regex ("part-.*[.]csv([.]deflate)?$")

      try {
        // Append the matching part files in the lexicographic order
        // of their names
        srcFS
          .listStatus (srcDir)
          .sortBy (_.getPath.getName)
          .foreach { status =>
            if (status.isFile()) {
              val srcPath = status.getPath()

              csvFilePattern.findFirstIn (srcPath.getName).foreach { _ =>
                // Decompress the input file with its own codec, if any:
                // getCodec() returns null for plain, uncompressed files,
                // in which case the raw input stream is copied as is
                val inputFile = srcFS.open (srcPath)
                val inputStream = Option (factory.getCodec (srcPath))
                  .map (_.createInputStream (inputFile))
                  .getOrElse (inputFile)

                try {
                  IOUtils.copyBytes (inputStream, outputStream,
                                     hadoopConfig, false)
                } finally {
                  inputStream.close()
                }
              }
            }
          }
      } finally {
        outputStream.close()
      }

      if (deleteSource) {
        srcFS.delete (srcDir, true)
      } else true
    } else false
  }
}
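As a usage illustration, a Spark job typically writes its CSV export as a directory of part files, which copyMerge() then concatenates into a single BZip2-compressed file. The sketch below assumes hypothetical paths and a Hadoop configuration obtained from the default environment; it is not part of the artifact itself:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.bom4v.ti.UtilityForHadoop3

object CopyMergeExample extends App {
  // Default Hadoop configuration (picks up core-site.xml / hdfs-site.xml)
  val conf = new Configuration()
  val fs = FileSystem.get (conf)

  // "/tmp/report-parts" would hold the part-*.csv(.deflate) files
  // produced by a (hypothetical) Spark CSV export
  val merged = UtilityForHadoop3.copyMerge (
    fs, new Path ("/tmp/report-parts"),
    fs, new Path ("/tmp/report.csv.bz2"),
    false, // deleteSource
    conf)

  println (s"Merge succeeded: $merged")
}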