All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hadoop.compression.lzo.LzopCodec Maven / Gradle / Ivy

Go to download

This project is a set of plugins for Apache Hadoop that provide access to the GPL'ed compression codecs.

The newest version!
/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

package com.hadoop.compression.lzo;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.EnumMap;
import java.util.Map;
import java.util.zip.Adler32;
import java.util.zip.Checksum;
import java.util.zip.CRC32;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.compress.BlockCompressorStream;
import org.apache.hadoop.io.compress.BlockDecompressorStream;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A {@link org.apache.hadoop.io.compress.CompressionCodec} for a streaming
 * lzo compression/decompression pair compatible with lzop.
 * http://www.lzop.org/
 */
public class LzopCodec extends LzoCodec {

  /** Logger for this codec; used for debug-level notes while parsing headers. */
  private static final Log LOG = LogFactory.getLog(LzopCodec.class);

  /**
   * 9 bytes at the top of every lzo file. Written first by
   * {@code writeLzopHeader} and compared byte-for-byte by {@code readHeader}.
   */
  private static final byte[] LZO_MAGIC = new byte[] {
    -119, 'L', 'Z', 'O', 0, '\r', '\n', '\032', '\n' };
  /**
   * Version of lzop this emulates. Written as the first header field and used
   * as the upper bound when checking the versions found in a header.
   */
  private static final int LZOP_VERSION = 0x1010;
  /**
   * Latest version of lzop this should be compatible with. Written as the
   * third header field.
   */
  private static final int LZOP_COMPAT_VERSION = 0x0940;

  /**
   * Wraps {@code out} in a stream that emits lzop-framed compressed data.
   *
   * The compression strategy is read from
   * {@code io.compression.codec.lzo.compressor} (default {@code LZO1X_1}) and
   * the buffer size from {@code io.compression.codec.lzo.buffersize}
   * (default 64 KiB).
   *
   * @param out stream that receives the lzop header and compressed blocks
   * @param compressor compressor handed to the new stream
   * @return a new {@link LzopOutputStream}
   * @throws IOException if the stream cannot be created
   * @throws RuntimeException if the native lzo library is not available
   */
  public CompressionOutputStream createOutputStream(OutputStream out,
      Compressor compressor) throws IOException {
    if (!isNativeLzoLoaded(getConf())) {
      throw new RuntimeException("native-lzo library not available");
    }

    final String strategyName = getConf().get(
        "io.compression.codec.lzo.compressor",
        LzoCompressor.CompressionStrategy.LZO1X_1.name());
    final LzoCompressor.CompressionStrategy chosen =
      LzoCompressor.CompressionStrategy.valueOf(strategyName);

    final int size =
      getConf().getInt("io.compression.codec.lzo.buffersize", 64 * 1024);

    return new LzopOutputStream(out, compressor, size, chosen);
  }

  /**
   * Wraps {@code in} in a stream that reads lzop-framed compressed data.
   *
   * The buffer size is read from {@code io.compression.codec.lzo.buffersize}
   * (default 256 KiB).
   *
   * @param in stream positioned at the start of an lzop file
   * @param decompressor decompressor handed to the new stream
   * @return a new {@link LzopInputStream}
   * @throws IOException if the stream cannot be created
   * @throws RuntimeException if the native lzo library is not available
   */
  public CompressionInputStream createInputStream(InputStream in,
      Decompressor decompressor) throws IOException {
    // The native-lzo library must be loaded & initialized before any stream
    // can be built.
    if (isNativeLzoLoaded(getConf())) {
      final int size =
        getConf().getInt("io.compression.codec.lzo.buffersize", 256 * 1024);
      return new LzopInputStream(in, decompressor, size);
    }
    throw new RuntimeException("native-lzo library not available");
  }

  /**
   * Creates a decompressor for lzop streams.
   *
   * The buffer size is read from {@code io.compression.codec.lzo.buffersize}
   * (default 256 KiB).
   *
   * @return a new {@link LzopDecompressor}
   * @throws RuntimeException if the native lzo library is not available
   */
  public Decompressor createDecompressor() {
    final boolean nativeAvailable = isNativeLzoLoaded(getConf());
    if (!nativeAvailable) {
      throw new RuntimeException("native-lzo library not available");
    }
    final int size =
      getConf().getInt("io.compression.codec.lzo.buffersize", 256 * 1024);
    return new LzopDecompressor(size);
  }

  /**
   * Reports the file-name suffix associated with this codec.
   *
   * @return the literal suffix {@code ".lzo"}, including the leading dot
   */
  public String getDefaultExtension() {
    final String lzopSuffix = ".lzo";
    return lzopSuffix;
  }

  /**
   * Checksums on decompressed block data with header bitmask, Checksum class.
   *
   * Each constant pairs the bit that announces the checksum in the lzop
   * header flags word with the {@link Checksum} implementation used to
   * compute it.
   */
  private enum DChecksum {
    F_ADLER32D(0x01, Adler32.class), F_CRC32D(0x100, CRC32.class);

    /** Bit in the header flags word that enables this checksum. */
    private final int mask;
    /** Implementation instantiated (via its no-arg constructor) per stream. */
    private final Class<? extends Checksum> clazz;

    DChecksum(int mask, Class<? extends Checksum> clazz) {
      this.mask = mask;
      this.clazz = clazz;
    }

    /**
     * @return the header flag bit for this checksum
     */
    public int getHeaderMask() {
      return mask;
    }

    /**
     * @return the checksum implementation class; typed so that
     *         {@code newInstance()} yields a {@link Checksum} without a cast
     */
    public Class<? extends Checksum> getChecksumClass() {
      return clazz;
    }
  }

  /**
   * Checksums on compressed block data with header bitmask, Checksum class.
   *
   * Each constant pairs the bit that announces the checksum in the lzop
   * header flags word with the {@link Checksum} implementation used to
   * compute it.
   */
  private enum CChecksum {
    F_ADLER32C(0x02, Adler32.class), F_CRC32C(0x200, CRC32.class);

    /** Bit in the header flags word that enables this checksum. */
    private final int mask;
    /** Implementation instantiated (via its no-arg constructor) per stream. */
    private final Class<? extends Checksum> clazz;

    CChecksum(int mask, Class<? extends Checksum> clazz) {
      this.mask = mask;
      this.clazz = clazz;
    }

    /**
     * @return the header flag bit for this checksum
     */
    public int getHeaderMask() {
      return mask;
    }

    /**
     * @return the checksum implementation class; typed so that
     *         {@code newInstance()} yields a {@link Checksum} without a cast
     */
    public Class<? extends Checksum> getChecksumClass() {
      return clazz;
    }
  }

  /**
   * Block-compressing output stream that frames its output like an lzop
   * file: the lzop header is written on construction and a four-byte zero
   * word is written on close.
   */
  protected static class LzopOutputStream extends BlockCompressorStream {

    /**
     * Write an lzop-compatible header to the OutputStream provided.
     *
     * The header is assembled in a scratch buffer so that its Adler-32
     * checksum can be computed over exactly the bytes that follow the magic
     * number; the magic, the fields and the checksum are then written to
     * {@code out}.
     *
     * @param out destination for the header
     * @param strategy compression strategy; only {@code LZO1X_1},
     *        {@code LZO1X_15} and {@code LZO1X_999} can be represented
     * @throws IOException if {@code strategy} is not one of the supported
     *         values or if writing to {@code out} fails
     */
    protected static void writeLzopHeader(OutputStream out,
        LzoCompressor.CompressionStrategy strategy) throws IOException {
      DataOutputBuffer dob = new DataOutputBuffer();
      try {
        dob.writeShort(LZOP_VERSION);
        dob.writeShort(LzoCompressor.LZO_LIBRARY_VERSION);
        dob.writeShort(LZOP_COMPAT_VERSION);
        // Two bytes per strategy: the first is the value readHeader checks as
        // "method" (1..3), the second is the one it skips as "level".
        switch (strategy) {
          case LZO1X_1:
            dob.writeByte(1);
            dob.writeByte(5);
            break;
          case LZO1X_15:
            dob.writeByte(2);
            dob.writeByte(1);
            break;
          case LZO1X_999:
            dob.writeByte(3);
            dob.writeByte(9);
            break;
          default:
            throw new IOException("Incompatible lzop strategy: " + strategy);
        }
        dob.writeInt(0);                                    // all flags 0
        dob.writeInt(0x81A4);                               // mode
        dob.writeInt((int)(System.currentTimeMillis() / 1000)); // mtime
        dob.writeInt(0);                                    // gmtdiff ignored
        dob.writeByte(0);                                   // no filename
        // Checksum covers everything buffered so far (magic excluded).
        Adler32 headerChecksum = new Adler32();
        headerChecksum.update(dob.getData(), 0, dob.getLength());
        int hc = (int)headerChecksum.getValue();
        dob.writeInt(hc);
        out.write(LZO_MAGIC);
        out.write(dob.getData(), 0, dob.getLength());
      } finally {
        dob.close();
      }
    }

    /**
     * Creates the stream and immediately writes the lzop header to
     * {@code out}.
     *
     * @param out destination stream
     * @param compressor compressor used for block data
     * @param bufferSize size of the compression buffer
     * @param strategy compression strategy recorded in the header; it also
     *        selects the fourth argument passed to the superclass
     *        constructor (the compression-overhead allowance in Hadoop's
     *        {@code BlockCompressorStream} API)
     * @throws IOException if the header cannot be written
     */
    public LzopOutputStream(OutputStream out, Compressor compressor,
        int bufferSize, LzoCompressor.CompressionStrategy strategy)
        throws IOException {
      super(out, compressor, bufferSize, strategy.name().contains("LZO1")
          ? (bufferSize >> 4) + 64 + 3
          : (bufferSize >> 3) + 128 + 3);
      writeLzopHeader(out, strategy);
    }

    /**
     * Close the underlying stream and write a null word to the output stream.
     *
     * The underlying stream is closed and this stream is marked closed even
     * when flushing the remaining data or writing the trailer fails, so a
     * failed close does not leak the underlying stream. Calling this method
     * again afterwards is a no-op.
     *
     * @throws IOException if finishing, writing the trailer, or closing the
     *         underlying stream fails
     */
    public void close() throws IOException {
      if (!closed) {
        try {
          finish();
          out.write(new byte[]{ 0, 0, 0, 0 });
        } finally {
          closed = true;
          out.close();
        }
      }
    }

  }

  /**
   * Block-decompressing input stream for lzop files. The lzop header is
   * parsed on construction; afterwards each compressed block is read together
   * with the checksums the header announced, and those checksums are compared
   * with the values computed by the {@link LzopDecompressor}.
   */
  protected static class LzopInputStream extends BlockDecompressorStream {

    /** Decompressed-data checksums still considered enabled for this file. */
    private EnumSet<DChecksum> dflags = EnumSet.allOf(DChecksum.class);
    /** Compressed-data checksums still considered enabled for this file. */
    private EnumSet<CChecksum> cflags = EnumSet.allOf(CChecksum.class);

    /** Scratch buffer, sized for the 9-byte magic number. */
    private final byte[] buf = new byte[9];
    /** Expected decompressed-data checksum values for the current block. */
    private EnumMap<DChecksum,Integer> dcheck
      = new EnumMap<DChecksum,Integer>(DChecksum.class);
    /** Expected compressed-data checksum values for the current block. */
    private EnumMap<CChecksum,Integer> ccheck
      = new EnumMap<CChecksum,Integer>(CChecksum.class);

    /**
     * Creates the stream and immediately reads and validates the lzop header.
     *
     * @param in stream positioned at the start of an lzop file
     * @param decompressor decompressor for block data; {@code readHeader}
     *        casts it to {@link LzopDecompressor}
     * @param bufferSize size of the input buffer
     * @throws IOException if the header is missing, invalid or unsupported
     */
    public LzopInputStream(InputStream in, Decompressor decompressor,
        int bufferSize) throws IOException {
      super(in, decompressor, bufferSize);
      readHeader(in);
    }

    /**
     * Read exactly len bytes into the start of buf. A single
     * {@code InputStream.read} call may deliver fewer bytes than requested,
     * so the read is repeated until the request is satisfied.
     *
     * @throws EOFException if the stream ends before len bytes were read
     */
    private static void readFully(InputStream in, byte[] buf, int len)
        throws IOException {
      int filled = 0;
      while (filled < len) {
        int count = in.read(buf, filled, len - filled);
        if (count < 0) {
          throw new EOFException();
        }
        filled += count;
      }
    }

    /**
     * Read len bytes into buf and return the first min(len, 4) of them as a
     * big-endian int, i.e. the LSB of the int returned is the last byte of
     * the first word read. buf only needs to hold len bytes.
     *
     * @throws EOFException if the stream ends before len bytes were read
     */
    private static int readInt(InputStream in, byte[] buf, int len) 
        throws IOException {
      readFully(in, buf, len);
      // Only touch bytes that were actually read; buf may be shorter than 4.
      final int width = Math.min(len, 4);
      int ret = 0;
      for (int i = 0; i < width; ++i) {
        ret = (ret << 8) | (0xFF & buf[i]);
      }
      return ret;
    }

    /**
     * Read bytes, update checksums, return first four bytes as an int, first
     * byte read in the MSB. buf is zeroed afterwards.
     */
    private static int readHeaderItem(InputStream in, byte[] buf, int len,
        Adler32 adler, CRC32 crc32) throws IOException {
      int ret = readInt(in, buf, len);
      adler.update(buf, 0, len);
      crc32.update(buf, 0, len);
      Arrays.fill(buf, (byte)0);
      return ret;
    }

    /**
     * Read and verify an lzo header, setting relevant block checksum options
     * and ignoring most everything else.
     *
     * @param in stream positioned at the magic number
     * @throws EOFException if the stream ends inside the header
     * @throws IOException if the magic number, versions, method, flags or
     *         header checksum are invalid or unsupported
     */
    protected void readHeader(InputStream in) throws IOException {
      readFully(in, buf, 9);
      if (!Arrays.equals(buf, LZO_MAGIC)) {
        throw new IOException("Invalid LZO header");
      }
      Arrays.fill(buf, (byte)0);
      Adler32 adler = new Adler32();
      CRC32 crc32 = new CRC32();
      int hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzop version
      if (hitem > LZOP_VERSION) {
        LOG.debug("Compressed with later version of lzop: " +
            Integer.toHexString(hitem) + " (expected 0x" +
            Integer.toHexString(LZOP_VERSION) + ")");
      }
      hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzo library version
      if (hitem > LzoDecompressor.LZO_LIBRARY_VERSION) {
        throw new IOException("Compressed with incompatible lzo version: 0x" +
            Integer.toHexString(hitem) + " (expected 0x" +
            Integer.toHexString(LzoDecompressor.LZO_LIBRARY_VERSION) + ")");
      }
      hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzop extract version
      if (hitem > LZOP_VERSION) {
        throw new IOException("Compressed with incompatible lzop version: 0x" +
            Integer.toHexString(hitem) + " (expected 0x" +
            Integer.toHexString(LZOP_VERSION) + ")");
      }
      hitem = readHeaderItem(in, buf, 1, adler, crc32); // method
      if (hitem < 1 || hitem > 3) {
          throw new IOException("Invalid strategy: " +
              Integer.toHexString(hitem));
      }
      readHeaderItem(in, buf, 1, adler, crc32); // ignore level

      // flags
      hitem = readHeaderItem(in, buf, 4, adler, crc32);
      try {
        // Iterate over snapshots so the flag sets are not modified while
        // they are being traversed. Enabled checksums are seeded with the
        // value of a fresh Checksum instance, which is what verifyChecksums()
        // compares against before the first block has been processed.
        for (DChecksum f : EnumSet.copyOf(dflags)) {
          if (0 == (f.getHeaderMask() & hitem)) {
            dflags.remove(f);
          } else {
            Checksum fresh = (Checksum) f.getChecksumClass().newInstance();
            dcheck.put(f, (int) fresh.getValue());
          }
        }
        for (CChecksum f : EnumSet.copyOf(cflags)) {
          if (0 == (f.getHeaderMask() & hitem)) {
            cflags.remove(f);
          } else {
            Checksum fresh = (Checksum) f.getChecksumClass().newInstance();
            ccheck.put(f, (int) fresh.getValue());
          }
        }
      } catch (InstantiationException e) {
        throw new RuntimeException("Internal error", e);
      } catch (IllegalAccessException e) {
        throw new RuntimeException("Internal error", e);
      }
      ((LzopDecompressor)decompressor).initHeaderFlags(dflags, cflags);
      boolean useCRC32 = 0 != (hitem & 0x00001000);   // F_H_CRC32
      boolean extraField = 0 != (hitem & 0x00000040); // F_H_EXTRA_FIELD
      if (0 != (hitem & 0x400)) {                     // F_MULTIPART
        throw new IOException("Multipart lzop not supported");
      }
      if (0 != (hitem & 0x800)) {                     // F_H_FILTER
        throw new IOException("lzop filter not supported");
      }
      if (0 != (hitem & 0x000FC000)) {                // F_RESERVED
        throw new IOException("Unknown flags in header");
      }
      // known !F_H_FILTER, so no optional block

      readHeaderItem(in, buf, 4, adler, crc32); // ignore mode
      readHeaderItem(in, buf, 4, adler, crc32); // ignore mtime
      readHeaderItem(in, buf, 4, adler, crc32); // ignore gmtdiff
      hitem = readHeaderItem(in, buf, 1, adler, crc32); // fn len
      if (hitem > 0) {
        // skip filename
        readHeaderItem(in, new byte[hitem], hitem, adler, crc32);
      }
      // Capture the running checksum before the stored checksum is consumed.
      int checksum = (int)(useCRC32 ? crc32.getValue() : adler.getValue());
      hitem = readHeaderItem(in, buf, 4, adler, crc32); // read checksum
      if (hitem != checksum) {
        throw new IOException("Invalid header checksum: " +
            Integer.toHexString(checksum) + " (expected 0x" +
            Integer.toHexString(hitem) + ")");
      }
      if (extraField) { // lzop 1.08 ultimately ignores this
        LOG.debug("Extra header field not processed");
        adler.reset();
        crc32.reset();
        hitem = readHeaderItem(in, buf, 4, adler, crc32);
        if (hitem < 0) {
          throw new IOException("Invalid extra header field length: " + hitem);
        }
        readHeaderItem(in, new byte[hitem], hitem, adler, crc32);
        checksum = (int)(useCRC32 ? crc32.getValue() : adler.getValue());
        if (checksum != readHeaderItem(in, buf, 4, adler, crc32)) {
          throw new IOException("Invalid checksum for extra header field");
        }
      }
    }

    /**
     * Take checksums recorded from block header and verify them against
     * those recorded by the decompressor.
     *
     * @throws IOException if any recorded checksum differs from the value
     *         the decompressor computed
     */
    private void verifyChecksums() throws IOException {
      LzopDecompressor ldecompressor = ((LzopDecompressor)decompressor);
      for (Map.Entry<DChecksum,Integer> chk : dcheck.entrySet()) {
        if (!ldecompressor.verifyDChecksum(chk.getKey(), chk.getValue())) {
          throw new IOException("Corrupted uncompressed block");
        }
      }
      for (Map.Entry<CChecksum,Integer> chk : ccheck.entrySet()) {
        if (!ldecompressor.verifyCChecksum(chk.getKey(), chk.getValue())) {
          throw new IOException("Corrupted compressed block");
        }
      }
    }

    /**
     * Read checksums and feed compressed block data into decompressor.
     *
     * The checksums recorded for the previous block are verified first, then
     * replaced with the values stored for the block about to be read.
     *
     * @throws EOFException if the stream ends inside the block
     * @throws IOException if a checksum of the previous block does not match
     *         or the stored block length is negative
     */
    protected void getCompressedData() throws IOException {
      checkStream();

      LzopDecompressor ldecompressor = (LzopDecompressor)decompressor;

      // Get the size of the compressed chunk
      int len = readInt(in, buf, 4);

      verifyChecksums();

      for (DChecksum chk : dcheck.keySet()) {
        dcheck.put(chk, readInt(in, buf, 4));
      }
      for (CChecksum chk : ccheck.keySet()) {
        // NOTE: if the compressed size is not less than the uncompressed
        //       size, this value is not present and decompression will fail.
        //       Fortunately, checksums on compressed data are rare, as is
        //       this case.
        ccheck.put(chk, readInt(in, buf, 4));
      }

      ldecompressor.resetChecksum();

      if (len < 0) {
        throw new IOException("Invalid compressed block length: " + len);
      }

      // Read len bytes from underlying stream
      if (len > buffer.length) {
        buffer = new byte[len];
      }
      int n = 0;
      while (n < len) {
        int count = in.read(buffer, n, len - n);
        if (count < 0) {
          throw new EOFException();
        }
        n += count;
      }

      // Send the read data to the decompressor
      decompressor.setInput(buffer, 0, len);
    }

    /**
     * Closes the stream, then verifies the checksums recorded for the last
     * block read.
     *
     * @throws IOException if closing fails or a checksum does not match
     */
    public void close() throws IOException {
      super.close();
      verifyChecksums();
    }
  }

  /**
   * Decompressor that additionally maintains the checksums an lzop file may
   * carry: checksums over the compressed input are updated in
   * {@link #setInput} and checksums over the decompressed output in
   * {@link #decompress}.
   */
  public static class LzopDecompressor extends LzoDecompressor {

    /** Running checksums over decompressed data, keyed by header flag. */
    private EnumMap<DChecksum,Checksum> chkDMap =
      new EnumMap<DChecksum,Checksum>(DChecksum.class);
    /** Running checksums over compressed data, keyed by header flag. */
    private EnumMap<CChecksum,Checksum> chkCMap =
      new EnumMap<CChecksum,Checksum>(CChecksum.class);

    /**
     * Create an LzoDecompressor with LZO1X strategy (the only lzo algorithm
     * supported by lzop).
     *
     * @param bufferSize buffer size passed to the superclass
     */
    public LzopDecompressor(int bufferSize) {
      super(LzoDecompressor.CompressionStrategy.LZO1X_SAFE, bufferSize);
    }

    /**
     * Given a set of decompressed and compressed checksums, register a fresh
     * {@link Checksum} instance for each of them. Checksums registered by an
     * earlier call are discarded first, so that a decompressor reused for
     * another file tracks only the checksums that file declares.
     *
     * @param dflags checksums computed over decompressed data
     * @param cflags checksums computed over compressed data
     * @throws RuntimeException if a checksum class cannot be instantiated
     */
    public void initHeaderFlags(EnumSet<DChecksum> dflags,
        EnumSet<CChecksum> cflags) {
      try {
        chkDMap.clear();
        chkCMap.clear();
        for (DChecksum flag : dflags) {
          chkDMap.put(flag, (Checksum) flag.getChecksumClass().newInstance());
        }
        for (CChecksum flag : cflags) {
          chkCMap.put(flag, (Checksum) flag.getChecksumClass().newInstance());
        }
      } catch (InstantiationException e) {
        throw new RuntimeException("Internal error", e);
      } catch (IllegalAccessException e) {
        throw new RuntimeException("Internal error", e);
      }
    }

    /**
     * Get the number of checksum implementations
     * the current lzo file uses.
     * @return Number of checksum implementations in use.
     */
    public int getChecksumsCount() {
      return this.chkCMap.size() + this.chkDMap.size();
    }
    
    /**
     * Reset all checksums registered for this decompressor instance.
     */
    public synchronized void resetChecksum() {
      for (Checksum chk : chkDMap.values()) chk.reset();
      for (Checksum chk : chkCMap.values()) chk.reset();
    }

    /**
     * Given a checksum type, verify its value against that observed in
     * decompressed data.
     *
     * @param typ checksum type; must have been registered through
     *        {@link #initHeaderFlags}
     * @param checksum expected value
     * @return true if the low 32 bits of the computed checksum equal
     *         {@code checksum}
     */
    public synchronized boolean verifyDChecksum(DChecksum typ, int checksum) {
      return (checksum == (int)chkDMap.get(typ).getValue());
    }

    /**
     * Given a checksum type, verify its value against that observed in
     * compressed data.
     *
     * @param typ checksum type; must have been registered through
     *        {@link #initHeaderFlags}
     * @param checksum expected value
     * @return true if the low 32 bits of the computed checksum equal
     *         {@code checksum}
     */
    public synchronized boolean verifyCChecksum(CChecksum typ, int checksum) {
      return (checksum == (int)chkCMap.get(typ).getValue());
    }

    /**
     * Feeds the compressed-data checksums with the input, then passes the
     * input on to the superclass.
     */
    public synchronized void setInput(byte[] b, int off, int len) {
      for (Checksum chk : chkCMap.values()) chk.update(b, off, len);
      super.setInput(b, off, len);
    }

    /**
     * Decompresses through the superclass and feeds the decompressed-data
     * checksums with the bytes produced.
     *
     * @return the value returned by the superclass
     * @throws IOException if the superclass fails to decompress
     */
    public synchronized int decompress(byte[] b, int off, int len)
        throws IOException {
      int ret = super.decompress(b, off, len);
      if (ret > 0) {
        for (Checksum chk : chkDMap.values()) chk.update(b, off, ret);
      }
      return ret;
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy