io.druid.java.util.common.CompressionUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-util Show documentation
java-util
There is a newer version: 0.12.3
/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.java.util.common;

import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.io.ByteSink;
import com.google.common.io.ByteSource;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import io.druid.java.util.common.logger.Logger;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Enumeration;
import java.util.concurrent.Callable;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

public class CompressionUtils
{
  private static final Logger log = new Logger(CompressionUtils.class);
  private static final int DEFAULT_RETRY_COUNT = 3;

  public static final String GZ_SUFFIX = ".gz";
  public static final String ZIP_SUFFIX = ".zip";

  /**
   * Zip the contents of directory into the file indicated by outputZipFile. Sub directories are skipped
   *
   * @param directory     The directory whose contents should be added to the zip in the output stream.
   * @param outputZipFile The output file to write the zipped data to
   *
   * @return The number of bytes (uncompressed) read from the input directory.
   *
   * @throws IOException
   */
  public static long zip(File directory, File outputZipFile) throws IOException
  {
    if (!isZip(outputZipFile.getName())) {
      log.warn("No .zip suffix[%s], putting files from [%s] into it anyway.", outputZipFile, directory);
    }

    try (final FileOutputStream out = new FileOutputStream(outputZipFile)) {
      return zip(directory, out);
    }
  }

  /**
   * Zips the contents of the input directory to the output stream. Sub directories are skipped
   *
   * @param directory The directory whose contents should be added to the zip in the output stream.
   * @param out       The output stream to write the zip data to. It is closed in the process
   *
   * @return The number of bytes (uncompressed) read from the input directory.
   *
   * @throws IOException
   */
  public static long zip(File directory, OutputStream out) throws IOException
  {
    if (!directory.isDirectory()) {
      throw new IOException(String.format("directory[%s] is not a directory", directory));
    }
    final File[] files = directory.listFiles();

    long totalSize = 0;
    try (final ZipOutputStream zipOut = new ZipOutputStream(out)) {
      for (File file : files) {
        log.info("Adding file[%s] with size[%,d].  Total size so far[%,d]", file, file.length(), totalSize);
        if (file.length() >= Integer.MAX_VALUE) {
          zipOut.finish();
          throw new IOException(String.format("file[%s] too large [%,d]", file, file.length()));
        }
        zipOut.putNextEntry(new ZipEntry(file.getName()));
        totalSize += Files.asByteSource(file).copyTo(zipOut);
      }
      zipOut.closeEntry();
      // Workarround for http://hg.openjdk.java.net/jdk8/jdk8/jdk/rev/759aa847dcaf
      zipOut.flush();
    }

    return totalSize;
  }

  /**
   * Unzip the byteSource to the output directory. If cacheLocally is true, the byteSource is cached to local disk before unzipping.
   * This may cause more predictable behavior than trying to unzip a large file directly off a network stream, for example.
   * * @param byteSource The ByteSource which supplies the zip data
   *
   * @param byteSource   The ByteSource which supplies the zip data
   * @param outDir       The output directory to put the contents of the zip
   * @param shouldRetry  A predicate expression to determine if a new InputStream should be acquired from ByteSource and the copy attempted again
   * @param cacheLocally A boolean flag to indicate if the data should be cached locally
   *
   * @return A FileCopyResult containing the result of writing the zip entries to disk
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult unzip(
      final ByteSource byteSource,
      final File outDir,
      final Predicate shouldRetry,
      boolean cacheLocally
  ) throws IOException
  {
    if (!cacheLocally) {
      try {
        return RetryUtils.retry(
            new Callable()
            {
              @Override
              public FileUtils.FileCopyResult call() throws Exception
              {
                return unzip(byteSource.openStream(), outDir);
              }
            },
            shouldRetry,
            DEFAULT_RETRY_COUNT
        );
      }
      catch (Exception e) {
        throw Throwables.propagate(e);
      }
    } else {
      final File tmpFile = File.createTempFile("compressionUtilZipCache", ZIP_SUFFIX);
      try {
        FileUtils.retryCopy(
            byteSource,
            tmpFile,
            shouldRetry,
            DEFAULT_RETRY_COUNT
        );
        return unzip(tmpFile, outDir);
      }
      finally {
        if (!tmpFile.delete()) {
          log.warn("Could not delete zip cache at [%s]", tmpFile.toString());
        }
      }
    }
  }

  /**
   * Unzip the byteSource to the output directory. If cacheLocally is true, the byteSource is cached to local disk before unzipping.
   * This may cause more predictable behavior than trying to unzip a large file directly off a network stream, for example.
   *
   * @param byteSource   The ByteSource which supplies the zip data
   * @param outDir       The output directory to put the contents of the zip
   * @param cacheLocally A boolean flag to indicate if the data should be cached locally
   *
   * @return A FileCopyResult containing the result of writing the zip entries to disk
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult unzip(
      final ByteSource byteSource,
      final File outDir,
      boolean cacheLocally
  ) throws IOException
  {
    return unzip(byteSource, outDir, FileUtils.IS_EXCEPTION, cacheLocally);
  }

  /**
   * Unzip the pulled file to an output directory. This is only expected to work on zips with lone files, and is not intended for zips with directory structures.
   *
   * @param pulledFile The file to unzip
   * @param outDir     The directory to store the contents of the file.
   *
   * @return a FileCopyResult of the files which were written to disk
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult unzip(final File pulledFile, final File outDir) throws IOException
  {
    if (!(outDir.exists() && outDir.isDirectory())) {
      throw new ISE("outDir[%s] must exist and be a directory", outDir);
    }
    log.info("Unzipping file[%s] to [%s]", pulledFile, outDir);
    final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
    try (final ZipFile zipFile = new ZipFile(pulledFile)) {
      final Enumeration enumeration = zipFile.entries();
      while (enumeration.hasMoreElements()) {
        final ZipEntry entry = enumeration.nextElement();
        result.addFiles(
            FileUtils.retryCopy(
                new ByteSource()
                {
                  @Override
                  public InputStream openStream() throws IOException
                  {
                    return new BufferedInputStream(zipFile.getInputStream(entry));
                  }
                },
                new File(outDir, entry.getName()),
                FileUtils.IS_EXCEPTION,
                DEFAULT_RETRY_COUNT
            ).getFiles()
        );
      }
    }
    return result;
  }

  /**
   * Unzip from the input stream to the output directory, using the entry's file name as the file name in the output directory.
   * The behavior of directories in the input stream's zip is undefined.
   * If possible, it is recommended to use unzip(ByteStream, File) instead
   *
   * @param in     The input stream of the zip data. This stream is closed
   * @param outDir The directory to copy the unzipped data to
   *
   * @return The FileUtils.FileCopyResult containing information on all the files which were written
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult unzip(InputStream in, File outDir) throws IOException
  {
    try (final ZipInputStream zipIn = new ZipInputStream(in)) {
      final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
      ZipEntry entry;
      while ((entry = zipIn.getNextEntry()) != null) {
        final File file = new File(outDir, entry.getName());
        Files.asByteSink(file).writeFrom(zipIn);
        result.addFile(file);
        zipIn.closeEntry();
      }
      return result;
    }
  }

  /**
   * gunzip the file to the output file.
   *
   * @param pulledFile The source of the gz data
   * @param outFile    A target file to put the contents
   *
   * @return The result of the file copy
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult gunzip(final File pulledFile, File outFile) throws IOException
  {
    return gunzip(Files.asByteSource(pulledFile), outFile);
  }

  /**
   * Unzips the input stream via a gzip filter. use gunzip(ByteSource, File, Predicate) if possible
   *
   * @param in      The input stream to run through the gunzip filter. This stream is closed
   * @param outFile The file to output to
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult gunzip(InputStream in, File outFile) throws IOException
  {
    try (GZIPInputStream gzipInputStream = gzipInputStream(in)) {
      Files.asByteSink(outFile).writeFrom(gzipInputStream);
      return new FileUtils.FileCopyResult(outFile);
    }
  }

  /**
   * Fixes java bug 7036144 http://bugs.java.com/bugdatabase/view_bug.do?bug_id=7036144 which affects concatenated GZip
   *
   * @param in The raw input stream
   *
   * @return A GZIPInputStream that can handle concatenated gzip streams in the input
   */
  public static GZIPInputStream gzipInputStream(final InputStream in) throws IOException
  {
    return new GZIPInputStream(
        new FilterInputStream(in)
        {
          @Override
          public int available() throws IOException
          {
            final int otherAvailable = super.available();
            // Hack. Docs say available() should return an estimate,
            // so we estimate about 1KB to work around available == 0 bug in GZIPInputStream
            return otherAvailable == 0 ? 1 << 10 : otherAvailable;
          }
        }
    );
  }

  /**
   * gunzip from the source stream to the destination stream.
   *
   * @param in  The input stream which is to be decompressed. This stream is closed.
   * @param out The output stream to write to. This stream is closed
   *
   * @return The number of bytes written to the output stream.
   *
   * @throws IOException
   */
  public static long gunzip(InputStream in, OutputStream out) throws IOException
  {
    try (GZIPInputStream gzipInputStream = gzipInputStream(in)) {
      final long result =  ByteStreams.copy(gzipInputStream, out);
      out.flush();
      return result;
    }
    finally {
      out.close();
    }
  }

  /**
   * A gunzip function to store locally
   *
   * @param in          The factory to produce input streams
   * @param outFile     The file to store the result into
   * @param shouldRetry A predicate to indicate if the Throwable is recoverable
   *
   * @return The count of bytes written to outFile
   */
  public static FileUtils.FileCopyResult gunzip(
      final ByteSource in,
      final File outFile,
      Predicate shouldRetry
  )
  {
    return FileUtils.retryCopy(
        new ByteSource()
        {
          @Override
          public InputStream openStream() throws IOException
          {
            return gzipInputStream(in.openStream());
          }
        },
        outFile,
        shouldRetry,
        DEFAULT_RETRY_COUNT
    );
  }


  /**
   * Gunzip from the input stream to the output file
   *
   * @param in      The compressed input stream to read from
   * @param outFile The file to write the uncompressed results to
   *
   * @return A FileCopyResult of the file written
   */
  public static FileUtils.FileCopyResult gunzip(final ByteSource in, File outFile)
  {
    return gunzip(in, outFile, FileUtils.IS_EXCEPTION);
  }

  /**
   * Copy inputStream to out while wrapping out in a GZIPOutputStream
   * Closes both input and output
   *
   * @param inputStream The input stream to copy data from. This stream is closed
   * @param out         The output stream to wrap in a GZIPOutputStream before copying. This stream is closed
   *
   * @return The size of the data copied
   *
   * @throws IOException
   */
  public static long gzip(InputStream inputStream, OutputStream out) throws IOException
  {
    try (GZIPOutputStream outputStream = new GZIPOutputStream(out)) {
      final long result = ByteStreams.copy(inputStream, outputStream);
      out.flush();
      return result;
    }
    finally {
      inputStream.close();
    }
  }

  /**
   * Gzips the input file to the output
   *
   * @param inFile      The file to gzip
   * @param outFile     A target file to copy the uncompressed contents of inFile to
   * @param shouldRetry Predicate on a potential throwable to determine if the copy should be attempted again.
   *
   * @return The result of the file copy
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult gzip(final File inFile, final File outFile, Predicate shouldRetry)
      throws IOException
  {
    gzip(Files.asByteSource(inFile), Files.asByteSink(outFile), shouldRetry);
    return new FileUtils.FileCopyResult(outFile);
  }

  public static long gzip(final ByteSource in, final ByteSink out, Predicate shouldRetry)
      throws IOException
  {
    return StreamUtils.retryCopy(
        in,
        new ByteSink()
        {
          @Override
          public OutputStream openStream() throws IOException
          {
            return new GZIPOutputStream(out.openStream());
          }
        },
        shouldRetry,
        DEFAULT_RETRY_COUNT
    );
  }


  /**
   * GZip compress the contents of inFile into outFile
   *
   * @param inFile  The source of data
   * @param outFile The destination for compressed data
   *
   * @return A FileCopyResult of the resulting file at outFile
   *
   * @throws IOException
   */
  public static FileUtils.FileCopyResult gzip(final File inFile, final File outFile) throws IOException
  {
    return gzip(inFile, outFile, FileUtils.IS_EXCEPTION);
  }

  /**
   * Checks to see if fName is a valid name for a "*.zip" file
   *
   * @param fName The name of the file in question
   *
   * @return True if fName is properly named for a .zip file, false otherwise
   */
  public static boolean isZip(String fName)
  {
    if (Strings.isNullOrEmpty(fName)) {
      return false;
    }
    return fName.endsWith(ZIP_SUFFIX); // Technically a file named `.zip` would be fine
  }

  /**
   * Checks to see if fName is a valid name for a "*.gz" file
   *
   * @param fName The name of the file in question
   *
   * @return True if fName is a properly named .gz file, false otherwise
   */
  public static boolean isGz(String fName)
  {
    if (Strings.isNullOrEmpty(fName)) {
      return false;
    }
    return fName.endsWith(GZ_SUFFIX) && fName.length() > GZ_SUFFIX.length();
  }

  /**
   * Get the file name without the .gz extension
   *
   * @param fname The name of the gzip file
   *
   * @return fname without the ".gz" extension
   *
   * @throws io.druid.java.util.common.IAE if fname is not a valid "*.gz" file name
   */
  public static String getGzBaseName(String fname)
  {
    final String reducedFname = Files.getNameWithoutExtension(fname);
    if (isGz(fname) && !reducedFname.isEmpty()) {
      return reducedFname;
    }
    throw new IAE("[%s] is not a valid gz file name", fname);
  }
}