All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.airlift.compress.lzo.HadoopLzopInputStream Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.airlift.compress.lzo;

import org.apache.hadoop.io.compress.CompressionInputStream;

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.zip.Adler32;
import java.util.zip.CRC32;
import java.util.zip.Checksum;

import static io.airlift.compress.lzo.LzoConstants.SIZE_OF_LONG;
import static io.airlift.compress.lzo.LzopCodec.LZOP_MAGIC;
import static io.airlift.compress.lzo.LzopCodec.LZO_1X_VARIANT;
import static java.lang.String.format;

class HadoopLzopInputStream
        extends CompressionInputStream
{
    private static final int LZO_VERSION_MAX = 0x20A0;
    private static final int LZOP_FILE_VERSION_MIN = 0x0940;
    private static final int LZOP_FORMAT_VERSION_MAX = 0x1010;

    private static final int LZOP_FLAG_ADLER32_DECOMPRESSED = 0x0000_0001;
    private static final int LZOP_FLAG_ADLER32_COMPRESSED = 0x0000_0002;
    private static final int LZOP_FLAG_CRC32_DECOMPRESSED = 0x0000_0100;
    private static final int LZOP_FLAG_CRC32_COMPRESSED = 0x0000_0200;
    private static final int LZOP_FLAG_CRC32_HEADER = 0x0000_1000;
    private static final int LZOP_FLAG_IO_MASK = 0x0000_000c;
    private static final int LZOP_FLAG_OPERATING_SYSTEM_MASK = 0xff00_0000;
    private static final int LZOP_FLAG_CHARACTER_SET_MASK = 0x00f0_0000;

    private final LzoDecompressor decompressor = new LzoDecompressor();
    private final InputStream in;
    private final byte[] uncompressedChunk;

    private int uncompressedLength;
    private int uncompressedOffset;

    private boolean finished;

    private byte[] compressed = new byte[0];
    private final boolean adler32Decompressed;
    private final boolean adler32Compressed;
    private final boolean crc32Decompressed;
    private final boolean crc32Compressed;

    public HadoopLzopInputStream(InputStream in, int maxUncompressedLength)
            throws IOException
    {
        super(in);
        this.in = in;
        // over allocate buffer which makes decompression easier
        uncompressedChunk = new byte[maxUncompressedLength + SIZE_OF_LONG];

        byte[] magic = new byte[LZOP_MAGIC.length];
        readInput(magic, 0, magic.length);
        if (!Arrays.equals(magic, LZOP_MAGIC)) {
            throw new IOException("Not an LZOP file");
        }

        byte[] header = new byte[25];
        readInput(header, 0, header.length);
        ByteArrayInputStream headerStream = new ByteArrayInputStream(header);

        // lzop version: ignored
        int lzopFileVersion = readBigEndianShort(headerStream);
        if (lzopFileVersion < LZOP_FILE_VERSION_MIN) {
            throw new IOException(format("Unsupported LZOP file version 0x%08X", lzopFileVersion));
        }

        // lzo version
        int lzoVersion = readBigEndianShort(headerStream);
        if (lzoVersion > LZO_VERSION_MAX) {
            throw new IOException(format("Unsupported LZO version 0x%08X", lzoVersion));
        }

        // lzop version of the format
        int lzopFormatVersion = readBigEndianShort(headerStream);
        if (lzopFormatVersion > LZOP_FORMAT_VERSION_MAX) {
            throw new IOException(format("Unsupported LZOP format version 0x%08X", lzopFormatVersion));
        }

        // variant: must be LZO 1X
        int variant = headerStream.read();
        if (variant != LZO_1X_VARIANT) {
            throw new IOException(format("Unsupported LZO variant %s", variant));
        }

        // level: ignored
        headerStream.read();

        // flags
        int flags = readBigEndianInt(headerStream);

        // ignore flags about the compression environment
        flags &= ~LZOP_FLAG_IO_MASK;
        flags &= ~LZOP_FLAG_OPERATING_SYSTEM_MASK;
        flags &= ~LZOP_FLAG_CHARACTER_SET_MASK;

        // checksum flags
        adler32Decompressed = (flags & LZOP_FLAG_ADLER32_DECOMPRESSED) != 0;
        adler32Compressed = (flags & LZOP_FLAG_ADLER32_COMPRESSED) != 0;
        crc32Decompressed = (flags & LZOP_FLAG_CRC32_DECOMPRESSED) != 0;
        crc32Compressed = (flags & LZOP_FLAG_CRC32_COMPRESSED) != 0;
        boolean crc32Header = (flags & LZOP_FLAG_CRC32_HEADER) != 0;

        flags &= ~LZOP_FLAG_ADLER32_DECOMPRESSED;
        flags &= ~LZOP_FLAG_ADLER32_COMPRESSED;
        flags &= ~LZOP_FLAG_CRC32_DECOMPRESSED;
        flags &= ~LZOP_FLAG_CRC32_COMPRESSED;
        flags &= ~LZOP_FLAG_CRC32_HEADER;

        // no other flags are supported
        if (flags != 0) {
            throw new IOException(format("Unsupported LZO flags 0x%08X", flags));
        }

        // output file mode: ignored
        readBigEndianInt(headerStream);

        // output file modified time: ignored
        readBigEndianInt(headerStream);

        // output file time zone offset: ignored
        readBigEndianInt(headerStream);

        // output file name: ignored
        int fileNameLength = headerStream.read();
        byte[] fileName = new byte[fileNameLength];
        readInput(fileName, 0, fileName.length);

        // verify header checksum
        int headerChecksumValue = readBigEndianInt(in);

        Checksum headerChecksum = crc32Header ? new CRC32() : new Adler32();
        headerChecksum.update(header, 0, header.length);
        headerChecksum.update(fileName, 0, fileName.length);
        if (headerChecksumValue != (int) headerChecksum.getValue()) {
            throw new IOException("Invalid header checksum");
        }
    }

    @Override
    public int read()
            throws IOException
    {
        if (finished) {
            return -1;
        }

        while (uncompressedOffset >= uncompressedLength) {
            int compressedLength = bufferCompressedData();
            if (finished) {
                return -1;
            }

            decompress(compressedLength, uncompressedChunk, 0, uncompressedChunk.length);
        }
        return uncompressedChunk[uncompressedOffset++] & 0xFF;
    }

    @Override
    public int read(byte[] output, int offset, int length)
            throws IOException
    {
        if (finished) {
            return -1;
        }

        while (uncompressedOffset >= uncompressedLength) {
            int compressedLength = bufferCompressedData();
            if (finished) {
                return -1;
            }

            // favor writing directly to user buffer to avoid extra copy
            if (length >= uncompressedLength) {
                decompress(compressedLength, output, offset, length);
                uncompressedOffset = uncompressedLength;
                return uncompressedLength;
            }

            decompress(compressedLength, uncompressedChunk, 0, uncompressedChunk.length);
        }
        int size = Math.min(length, uncompressedLength - uncompressedOffset);
        System.arraycopy(uncompressedChunk, uncompressedOffset, output, offset, size);
        uncompressedOffset += size;
        return size;
    }

    @Override
    public void resetState()
            throws IOException
    {
        uncompressedLength = 0;
        uncompressedOffset = 0;
        finished = false;
    }

    private int bufferCompressedData()
            throws IOException
    {
        uncompressedOffset = 0;
        uncompressedLength = readBigEndianInt(in);
        if (uncompressedLength == -1) {
            // LZOP file MUST end with uncompressedLength == 0
            throw new EOFException("encountered EOF while reading block data");
        }
        if (uncompressedLength == 0) {
            finished = true;
            return -1;
        }

        int compressedLength = readBigEndianInt(in);
        if (compressedLength == -1) {
            throw new EOFException("encountered EOF while reading block data");
        }

        skipChecksums(compressedLength < uncompressedLength);

        return compressedLength;
    }

    private void skipChecksums(boolean compressed)
            throws IOException
    {
        if (adler32Decompressed) {
            readBigEndianInt(in);
        }
        if (crc32Decompressed) {
            readBigEndianInt(in);
        }
        if (compressed && adler32Compressed) {
            readBigEndianInt(in);
        }
        if (compressed && crc32Compressed) {
            readBigEndianInt(in);
        }
    }

    private void decompress(int compressedLength, byte[] output, int outputOffset, int outputLength)
            throws IOException
    {
        if (uncompressedLength == compressedLength) {
            readInput(output, outputOffset, compressedLength);
        }
        else {
            if (compressed.length < compressedLength) {
                // over allocate buffer which makes decompression easier
                compressed = new byte[compressedLength + SIZE_OF_LONG];
            }
            readInput(compressed, 0, compressedLength);
            int actualUncompressedLength = decompressor.decompress(compressed, 0, compressedLength, output, outputOffset, outputLength);
            if (actualUncompressedLength != uncompressedLength) {
                throw new IOException("Decompressor did not decompress the entire block");
            }
        }
    }

    private void readInput(byte[] buffer, int offset, int length)
            throws IOException
    {
        while (length > 0) {
            int size = in.read(buffer, offset, length);
            if (size == -1) {
                throw new EOFException("encountered EOF while reading block data");
            }
            offset += size;
            length -= size;
        }
    }

    private static int readBigEndianShort(InputStream in)
            throws IOException
    {
        int b1 = in.read();
        if (b1 < 0) {
            return -1;
        }

        int b2 = in.read();
        // If second byte is negative, the stream it truncated
        if ((b2) < 0) {
            throw new IOException("Stream is truncated");
        }
        return (b1 << 8) + (b2);
    }

    private static int readBigEndianInt(InputStream in)
            throws IOException
    {
        int b1 = in.read();
        if (b1 < 0) {
            return -1;
        }
        int b2 = in.read();
        int b3 = in.read();
        int b4 = in.read();

        // If any of the other bits are negative, the stream it truncated
        if ((b2 | b3 | b4) < 0) {
            throw new IOException("Stream is truncated");
        }
        return ((b1 << 24) + (b2 << 16) + (b3 << 8) + (b4));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy