/*
   Copyright 2010 Aaron J. Radke

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
package cc.drx

// import net.jpountz.lz4
import org.apache.commons.compress.archivers
import org.apache.commons.compress.compressors
import archivers.ArchiveEntry //TODO can this be a generic one that is part of drx?????
import archivers.ArchiveInputStream
import archivers.ArchiveOutputStream
import java.util.zip.GZIPInputStream
import java.util.zip.GZIPOutputStream

import java.io.{InputStream => JIS}
import java.io.{OutputStream => JOS}

/*
TODO generalize the ArchiveReader and ArchiveWriter classes so they work on zip.ZipInputStream files without depending on apache commons
TODO unify the File and Archive APIs using / and glob-like formats
  this will also offer a more general entry point from File directly instead of Archive
TODO use commons compress lz4 instead of jpountz ??
TODO add brotli br file reading
TODO add snappy read/write format
TODO add 7z in memory/file walking
*/

/*
 *  Extract an archive to the current directory:
 *  Archive("large.tgz").extract
 *
 *  Extract an archive to a specified directory:
 *  Archive("large.tgz").extract(File("large"))
 *
 *  Extract a tar ball but stream the files out as GZ:
 *  Archive("large.tgz").extract(File("."), Archive.GZ)
 *
 *  Iterate over the entries, counting the lines in each file:
 *  for(f <- Archive("large.tgz")) {print(s"%-50s  " format f.file.path); Console.flush; println(IO.lines(f.is,autoClose=false).size)}
 *
 *  Make and write to an archive:
 *  val entries = Seq(FileEntry("test.txt", "simple text"), FileEntry("test2.txt", "simple 2 test"))
 *  Archive("test2.tar").write{a => entries foreach {a add _}}
 */
object Archive{

   private def mkEntryFromStream(entry:ArchiveEntry,input:Input):PathEntry = {
     val modDate = Try{Date(entry.getLastModifiedDate)}  getOrElse Date.now //TODO what date should be the failed time?
     if(entry.isDirectory) DirEntry(File(entry.getName), modDate)
     else                 new FileEntry(File(entry.getName), ()=>input, Bytes(entry.getSize), modDate)
   }

   class ArchiveReader(ais:ArchiveInputStream) extends Iterator[PathEntry] {
      private var opened = true
      private var autoClose = true
      private var entry:ArchiveEntry = null
      def keepOpen:ArchiveReader = {autoClose = false; this}
      def hasNext:Boolean = {
         entry = ais.getNextEntry //null if no more entries, java.io.EOFException if it cannot be read
         if(entry != null) true else {
            if(opened && autoClose) close
            false
         }
      }
      def next:PathEntry = mkEntryFromStream(entry,Input(ais).keepOpen) //TODO is this keepOpen needed or best placed here?

      def close = {opened = false; ais.close}

      def extract(outputDir:File=File("."), streamGenerator:StreamGenerator=StreamGenerator.Normal):Unit = foreach{
         case e:FileEntry => {
            val ext = streamGenerator.exts.headOption
            val f = if(ext.isDefined) File(outputDir.path + "/" + e.file.path + "." + ext.get)
                    else outputDir / e.file
            println(s"Extracting to $f")
            //f.out.as(streamGenerator).write{os => IO.copy(e.is, os, false)}
            e.in.keepOpen copy f.out.as(streamGenerator)
         }
         case d:DirEntry => d.file.file.mkdirs //java mkdirs
      }

      /**iterate all dirs and files*/
      def paths:Iterator[PathEntry] = this  //this should iterate all paths
      /**filter paths by files only (skip dirs)*/
      def files:Iterator[FileEntry] = collect{case f:FileEntry => f}  //filter by File entry only

      def walk(parent:FileEntry)(implicit lib:ArchiveLib):Iterator[FileEntry] = files.flatMap{ f =>
        Archive.tryWalk(Try(lib reader f), new DeepFileEntry(parent,f) )
      }

   }
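
   /* Usage sketch (hypothetical file name) for reading, using the TGZ generator defined below:
    *   Archive.TGZ.reader(File("data.tgz")).files foreach {e => println(s"${e.file.path}  ${e.size}")}
    * The reader closes itself after the last entry is consumed unless keepOpen was called.
    */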
   class ArchiveWriter(val aos:ArchiveOutputStream,ag:ArchiveGenerator){
      def write(writeFunc:ArchiveWriter => Unit) = { writeFunc(this); close }
      def write(entries:Iterable[PathEntry]):Unit = write{_ ++= entries}
      final def +=(entry:PathEntry) = add(entry)
      final def ++=(entries:Iterable[PathEntry]) = entries foreach (add _)
      final def add(entry:PathEntry) = {
         aos.putArchiveEntry(ag.entry(entry))
         entry match{
            case e:FileEntry => e.in.keepOpen copy Output(aos).keepOpen
            case _ => 
         }
         aos.closeArchiveEntry
      }
      def close:Unit = aos.close
      def finish:Unit = aos.finish
   }
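
   /* Usage sketch (hypothetical names) for writing, mirroring the FileEntry example in the header comment:
    *   val entries = Seq(FileEntry("a.txt", "alpha"), FileEntry("b.txt", "beta"))
    *   Archive.TAR.writer(File("out.tar")).write{w => entries foreach (w += _)}
    * write(...) runs the supplied function and then closes the underlying stream.
    */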

   trait ArchiveGenerator extends IOGenerator[ArchiveReader,ArchiveWriter]{
      def entry(e:PathEntry):ArchiveEntry

      /**forward the FileEntry to the default cc.drx.File test for canRead */
      def canRead(f:FileEntry):Boolean = canRead(f.file)  

      def reader( f:FileEntry):ArchiveReader = apply(f.in)
      // def canWrite(f:FileEntry) = canWrite(f.file)
      // def writer( f:FileEntry) = apply(f) //for now writing has only been tested to files, but IO stream has been tested both directions

      def reader(f:File):ArchiveReader = apply(f.in.is)
      def writer(f:File):ArchiveWriter = apply(f.out.os)
   }
   case object TBZ2 extends ArchiveGenerator {
      val exts = List("tbz2","tar.bz2")
      def apply(jis:JIS) = new ArchiveReader(new archivers.tar.TarArchiveInputStream(new compressors.bzip2.BZip2CompressorInputStream(jis)))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.tar.TarArchiveOutputStream(new compressors.bzip2.BZip2CompressorOutputStream(jos)),this)
      def entry(e:PathEntry):ArchiveEntry = {val t = new archivers.tar.TarArchiveEntry(e.file.path); t.setSize(e.size.byteCount); t}
   }
   case object TLZ4 extends ArchiveGenerator {
      val exts = List("tlz4","tar.lz4")
      def apply(jis:JIS) = new ArchiveReader(new archivers.tar.TarArchiveInputStream(new compressors.lz4.BlockLZ4CompressorInputStream(jis)))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.tar.TarArchiveOutputStream(new compressors.lz4.BlockLZ4CompressorOutputStream(jos)),this)
      def entry(e:PathEntry):ArchiveEntry = {val t = new archivers.tar.TarArchiveEntry(e.file.path); t.setSize(e.size.byteCount); t}
   }
   case object TGZ extends ArchiveGenerator {
      val exts = List("tgz","tar.gz")
      def apply(jis:JIS) = new ArchiveReader(new archivers.tar.TarArchiveInputStream(new GZIPInputStream(jis)))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.tar.TarArchiveOutputStream(new GZIPOutputStream(jos)),this)
      def entry(e:PathEntry):ArchiveEntry = {val t = new archivers.tar.TarArchiveEntry(e.file.path); t.setSize(e.size.byteCount); t}
   }
   case object TAR extends ArchiveGenerator {
      val exts = List("tar")
      def apply(jis:JIS) = new ArchiveReader(new archivers.tar.TarArchiveInputStream(jis))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.tar.TarArchiveOutputStream(jos),this)
      def entry(e:PathEntry):ArchiveEntry = {val t = new archivers.tar.TarArchiveEntry(e.file.path); t.setSize(e.size.byteCount); t}
   }
   case object ZIP extends ArchiveGenerator {
      val exts = List("zip")
      def apply(jis:JIS) = new ArchiveReader(new archivers.zip.ZipArchiveInputStream(jis))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.zip.ZipArchiveOutputStream(jos),this)
      def entry(e:PathEntry):ArchiveEntry = new archivers.zip.ZipArchiveEntry(e.file.path)
   }
   case object JAR extends ArchiveGenerator {
      val exts = List("jar")
      def apply(jis:JIS) = new ArchiveReader(new archivers.jar.JarArchiveInputStream(jis))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.jar.JarArchiveOutputStream(jos),this)
      def entry(e:PathEntry):ArchiveEntry = new archivers.jar.JarArchiveEntry(e.file.path)
   }
   case object AR extends ArchiveGenerator {
      val exts = List("ar")
      def apply(jis:JIS) = new ArchiveReader(new archivers.ar.ArArchiveInputStream(jis))
      def apply(jos:JOS) = new ArchiveWriter(new archivers.ar.ArArchiveOutputStream(jos),this)
      def entry(e:PathEntry):ArchiveEntry = new archivers.ar.ArArchiveEntry(e.file.path, e.size.byteCount)
   }
   case object ARJ extends ArchiveGenerator {
      val exts = List("arj")
      def apply(jis:JIS) = new ArchiveReader(new archivers.arj.ArjArchiveInputStream(jis))
      override def canWrite = false
      def apply(jos:JOS) = ???
      def entry(e:PathEntry):ArchiveEntry = ???
   }
   case object CPIO extends ArchiveGenerator {
      val exts = List("cpio")
      def apply(jis:JIS) = new ArchiveReader(new archivers.cpio.CpioArchiveInputStream(jis))
      override def canWrite = false //FIXME error writing with past end of STORED entry
      def apply(jos:JOS) = new ArchiveWriter(new archivers.cpio.CpioArchiveOutputStream(jos),this)
      def entry(e:PathEntry):ArchiveEntry = new archivers.cpio.CpioArchiveEntry(e.file.path)
   }
   case object DUMP extends ArchiveGenerator {
      val exts = List("dump")
      def apply(jis:JIS) = new ArchiveReader(new archivers.dump.DumpArchiveInputStream(jis))
      override def canWrite = false
      def apply(jos:JOS) = ??? //read-only support
      def entry(e:PathEntry):ArchiveEntry = ???
   }
   case object SEVENZ extends ArchiveGenerator {
      import archivers.sevenz._ //SevenZFile, SevenZOutputFile, SevenZMethod
      val exts = List("7z")
      override def canRead(f:File):Boolean = canGenerate(f) && f.isFile  //the file exists and is not a stream..

      /**these classes are required to simulate 7z file streaming like the rest of the commons-compress api*/
      class SevenZArchiveInputStream(f:File) extends ArchiveInputStream {
         private val zf = new SevenZFile(f.file)
         override def close() = zf.close()
         def getNextEntry() = zf.getNextEntry()
         override def read(b:Array[Byte], off:Int, len:Int) = zf.read(b,off,len)
      }
      /**these classes are required to simulate 7z file streaming like the rest of the commons-compress api*/
      class SevenZArchiveOutputStream(f:File) extends ArchiveOutputStream {
         private val zf = new SevenZOutputFile(f.file)
         override def close() = zf.close()
         def putArchiveEntry(e:ArchiveEntry) = zf.putArchiveEntry(e)
         def closeArchiveEntry() = zf.closeArchiveEntry()
         def createArchiveEntry(jf:java.io.File,name:String) = zf.createArchiveEntry(jf,name)
         def finish() = zf.finish()
         override def write(b:Array[Byte], off:Int, len:Int) = zf.write(b,off,len)
         def setContentCompression(m:SevenZMethod) = zf.setContentCompression(m)
      }

      override def reader(f:File) = new ArchiveReader(new SevenZArchiveInputStream(f))
      override def writer(f:File) = new ArchiveWriter(new SevenZArchiveOutputStream(f),this)

      //def apply(is:JIS) = new ArchiveReader(new archivers.sevenz.Seven7ArchiveInputStream(is))
      import archivers.ArchiveStreamFactory
      def apply(is:JIS) = new ArchiveReader(new ArchiveStreamFactory().createArchiveInputStream(ArchiveStreamFactory.SEVEN_Z, is)) //TODO this does not support streaming... try copy to a tmp dir for sevenz file access
      def apply(os:JOS) = new ArchiveWriter(new ArchiveStreamFactory().createArchiveOutputStream(ArchiveStreamFactory.SEVEN_Z, os),this) //TODO this does not support streaming..

      def entry(e:PathEntry):ArchiveEntry = {val t = new archivers.sevenz.SevenZArchiveEntry(); t.setName(e.file.path); t.setSize(e.size.byteCount); t}
   }
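
   /* Usage note (hypothetical file name): 7z requires random file access rather than pure streaming,
    * so prefer the File-based reader/writer above over the stream apply methods:
    *   Archive.SEVENZ.reader(File("bundle.7z")).files foreach {f => println(f.file.path)}
    */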

   trait ArchiveLib{
     val libs:Set[ArchiveGenerator]
     def reader(f:FileEntry):Option[ArchiveReader] = libs.find(_ canRead  f).map{_.reader(f)}
     // def writer(f:FileEntry):Option[ArchiveWriter] = libs.find(_ canWrite f).map{_.writer(f)}
     def reader(f:File):Option[ArchiveReader]      = libs.find(_ canRead  f).map{_.reader(f)}  //this is a special version that prevents streaming and uses direct file access
     def writer(f:File):Option[ArchiveWriter]      = libs.find(_ canWrite f).map{_.writer(f)}
   }
   object ArchiveLib{
     //implicits are first searched for within an object of the same name (so no imports are necessary)
     implicit object DefaultArchiveLib extends ArchiveLib{
       val libs:Set[ArchiveGenerator] = Set(TBZ2,TLZ4,TGZ,TAR,ZIP,JAR,AR,ARJ,CPIO,DUMP,SEVENZ)
     }
   }
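
   /* Sketch of a narrowed ArchiveLib (hypothetical), e.g. to treat only tarballs and zips as archives:
    *   implicit val tarAndZipOnly:Archive.ArchiveLib = new Archive.ArchiveLib{
    *     val libs:Set[Archive.ArchiveGenerator] = Set(Archive.TGZ, Archive.TAR, Archive.ZIP)
    *   }
    * Any walk with this implicit in scope will then descend into only those formats.
    */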

   /**function to provide shared code for DeepFileEntry and FileEntry walking, which is why it takes the strange by-name fe generator argument*/
   protected def tryWalk(reader:Try[Option[ArchiveReader]], fe: =>FileEntry):Iterator[FileEntry] = {
     reader match {
       //-- If the file is a known archive format try walking it
       case Success(Some(r)) =>
        Try{r.keepOpen.walk(fe)} match {
          case Success(it) => it
          //if a failure occurred during the walk then skip and let us know
          case Failure(e) =>
            Console.err.println(s"Warning: Archive.walk skipping $fe during walk because: $e")
            Iterator()
        }
       //--When not an archive file, just return the single deep file entry
       case Success(None)    => 
         Iterator(fe)
       //--On failure to open an archive
       case Failure(e)       => 
         Console.err.println(s"Warning: Archive.walk skipping $fe during open because: $e")
         Iterator(fe)
     }
   }

   /** loop through directories looking for all files including archive files to unzip and stream */
   def walk(startingFile:File)(implicit lib:ArchiveLib):Iterator[FileEntry] = {
     if(!startingFile.isDir && !startingFile.isFile){
       Console.err.println(s"Warning: in Archive.walk start file does not exist: $startingFile")
     }
     startingFile.walk.flatMap{f =>
        if(f.isDir) Archive.walk(f) //recursive dir and archive search
        else tryWalk( Try(lib reader f), FileEntry(f))
     }
   }
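
   /* Usage sketch (hypothetical directory name): stream every file, including files nested inside archives:
    *   Archive.walk(File("data")) foreach {f => println(f.file.path)}
    */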

   /** loop through directories like walk, but first dive through all the files counting them so a progress ratio can be reported*/
   def walkWithProgress(startingFile:File)(implicit lib:ArchiveLib):Iterator[(Ratio,FileEntry)] = {
      //example report format: s"[$progress] ${file.fullPath}"
      def files = Archive.walk(startingFile) //def (not val) so the walk runs twice: once to count and once to list
      var total = 0
      val counting = for((f,i) <- files.zipWithIndex) yield {total += 1; (Ratio(0,i),f)} //first pass: count while reporting zero progress
      val listing  = for((f,i) <- files.zipWithIndex) yield (Ratio(i+1, total), f)       //second pass: report progress against the now known total
      counting ++ listing
   }
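
   /* Usage sketch (hypothetical directory name), printing the ratio next to each entry path:
    *   for((progress,f) <- Archive.walkWithProgress(File("archives"))) println(s"[$progress] ${f.file.path}")
    * Each entry appears twice: once with Ratio(0,i) while counting, then with progress against the total.
    */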

}

//TODO maybe add implicit from File ???

case class Archive(f:File)(implicit lib:Archive.ArchiveLib){
   import Archive._
   private def in  = lib.reader(f) getOrElse ZIP(f.in.is)   //fall back to zip when no generator claims the file
   private def out = lib.writer(f) getOrElse ZIP(f.out.os)  //fall back to zip when no generator claims the file

   def write(writeFunc:ArchiveWriter => Unit) = out.write(writeFunc)
   def walk:Iterator[FileEntry] = Archive.walk(f)  //deep walk that also opens nested archive files
   def walkWithProgress = Archive.walkWithProgress(f)
   //TODO rename list to paths
   def list:Iterator[PathEntry] = in.paths //list all the entries
   def files:Iterator[FileEntry] = in.files //list all the files entries
   // def iterator = in.iterator
}
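
/* Usage sketch (hypothetical file name) for the case-class entry point, assuming the default ArchiveLib is in scope:
 *   val a = Archive(File("bundle.zip"))
 *   a.files foreach {f => println(f.file.path)} //shallow listing of file entries
 *   a.walk  foreach {f => println(f.file.path)} //deep walk that also opens nested archives
 */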



