com.facebook.presto.orc.stream.OrcInputStream

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc.stream;

import com.facebook.presto.orc.OrcCorruptionException;
import com.facebook.presto.orc.memory.AbstractAggregatedMemoryContext;
import com.facebook.presto.orc.memory.LocalMemoryContext;
import com.facebook.presto.orc.metadata.CompressionKind;
import com.google.common.base.MoreObjects;
import com.google.common.primitives.Ints;
import io.airlift.slice.FixedLengthSliceInput;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import org.iq80.snappy.Snappy;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

import static com.facebook.presto.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint;
import static com.facebook.presto.orc.checkpoint.InputStreamCheckpoint.decodeCompressedBlockOffset;
import static com.facebook.presto.orc.checkpoint.InputStreamCheckpoint.decodeDecompressedOffset;
import static com.facebook.presto.orc.metadata.CompressionKind.SNAPPY;
import static com.facebook.presto.orc.metadata.CompressionKind.UNCOMPRESSED;
import static com.facebook.presto.orc.metadata.CompressionKind.ZLIB;
import static com.google.common.base.Preconditions.checkArgument;
import static io.airlift.slice.Slices.EMPTY_SLICE;
import static java.util.Objects.requireNonNull;
import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET;

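/**
 * Stream over an ORC data stream that transparently decodes the writer's optional block
 * compression (SNAPPY or ZLIB) and supports seeking back to previously captured checkpoints.
 */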
public final class OrcInputStream
        extends InputStream
{
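    // Initial sizing heuristic for the decompression buffer: assume a compressed chunk expands
    // to at most about 5x its size; decompressZip grows the buffer if this guess is too small.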
    public static final int EXPECTED_COMPRESSION_RATIO = 5;
    private final String source;
    private final FixedLengthSliceInput compressedSliceInput;
    private final CompressionKind compressionKind;
    private final int maxBufferSize;

    private int currentCompressedBlockOffset;
    private FixedLengthSliceInput current;

    private byte[] buffer;
    private final LocalMemoryContext bufferMemoryUsage;

    // When uncompressed,
    // * This tracks the memory usage of `current`.
    // When compressed,
    // * This tracks the memory usage of compressedSliceInput.
    // * Memory pointed to by `current` is always part of `buffer`. It shouldn't be counted again.
    private final LocalMemoryContext fixedMemoryUsage;

    public OrcInputStream(String source, FixedLengthSliceInput sliceInput, CompressionKind compressionKind, int bufferSize, AbstractAggregatedMemoryContext systemMemoryContext)
    {
        this.source = requireNonNull(source, "source is null");

        requireNonNull(sliceInput, "sliceInput is null");

        this.compressionKind = requireNonNull(compressionKind, "compressionKind is null");
        this.maxBufferSize = bufferSize;

        requireNonNull(systemMemoryContext, "systemMemoryContext is null");
        this.bufferMemoryUsage = systemMemoryContext.newLocalMemoryContext();
        this.fixedMemoryUsage = systemMemoryContext.newLocalMemoryContext();
        this.fixedMemoryUsage.setBytes(sliceInput.length());

        if (compressionKind == UNCOMPRESSED) {
            this.current = sliceInput;
            this.compressedSliceInput = EMPTY_SLICE.getInput();
        }
        else {
            checkArgument(compressionKind == SNAPPY || compressionKind == ZLIB, "%s compression not supported", compressionKind);
            this.compressedSliceInput = sliceInput;
            this.current = EMPTY_SLICE.getInput();
        }
    }

    @Override
    public void close()
            throws IOException
    {
        current = null;
        fixedMemoryUsage.setBytes(compressedSliceInput.length()); // see comments above for fixedMemoryUsage

        buffer = null;
        bufferMemoryUsage.setBytes(0);
    }

    @Override
    public int available()
            throws IOException
    {
        if (current == null) {
            return 0;
        }
        return current.available();
    }

    @Override
    public boolean markSupported()
    {
        return false;
    }

    @Override
    public int read()
            throws IOException
    {
        if (current == null) {
            return -1;
        }

        int result = current.read();
        if (result != -1) {
            return result;
        }

        advance();
        return read();
    }

    @Override
    public int read(byte[] b, int off, int length)
            throws IOException
    {
        if (current == null) {
            return -1;
        }

        if (current.remaining() == 0) {
            advance();
            if (current == null) {
                return -1;
            }
        }

        return current.read(b, off, length);
    }

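    // A checkpoint packs the offset of the current compressed block and the position within the
    // decompressed data into a single long (see InputStreamCheckpoint), so the stream can later
    // be repositioned with seekToCheckpoint.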
    public long getCheckpoint()
    {
        // if the decompressed buffer is empty, return a checkpoint starting at the next block
        if (current == null || (current.position() == 0 && current.remaining() == 0)) {
            return createInputStreamCheckpoint(Ints.checkedCast(compressedSliceInput.position()), 0);
        }
        // otherwise return a checkpoint at the last compressed block read and the current position in the buffer
        return createInputStreamCheckpoint(currentCompressedBlockOffset, Ints.checkedCast(current.position()));
    }

    public boolean seekToCheckpoint(long checkpoint)
            throws IOException
    {
        int compressedBlockOffset = decodeCompressedBlockOffset(checkpoint);
        int decompressedOffset = decodeDecompressedOffset(checkpoint);
        boolean discardedBuffer;
        if (compressedBlockOffset != currentCompressedBlockOffset) {
            if (compressionKind == UNCOMPRESSED) {
                throw new OrcCorruptionException("Reset stream has a compressed block offset but stream is not compressed");
            }
            compressedSliceInput.setPosition(compressedBlockOffset);
            current = EMPTY_SLICE.getInput();
            discardedBuffer = true;
        }
        else {
            discardedBuffer = false;
        }

        if (decompressedOffset != current.position()) {
            current.setPosition(0);
            if (current.remaining() < decompressedOffset) {
                decompressedOffset -= current.remaining();
                advance();
            }
            current.setPosition(decompressedOffset);
        }
        return discardedBuffer;
    }

    @Override
    public long skip(long n)
            throws IOException
    {
        if (current == null || n <= 0) {
            return -1;
        }

        long result = current.skip(n);
        if (result != 0) {
            return result;
        }
        if (read() == -1) {
            return 0;
        }
        return 1 + current.skip(n - 1);
    }

    // This comes from the Apache Hive ORC code
    private void advance()
            throws IOException
    {
        if (compressedSliceInput == null || compressedSliceInput.remaining() == 0) {
            current = null;
            return;
        }

        // 3 byte header
        // NOTE: this must match BLOCK_HEADER_SIZE
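        // The low bit of the first header byte marks an "original" (uncompressed) chunk;
        // the remaining 23 bits, least significant byte first, encode the chunk length.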
        currentCompressedBlockOffset = Ints.checkedCast(compressedSliceInput.position());
        int b0 = compressedSliceInput.readUnsignedByte();
        int b1 = compressedSliceInput.readUnsignedByte();
        int b2 = compressedSliceInput.readUnsignedByte();

        boolean isUncompressed = (b0 & 0x01) == 1;
        int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >>> 1);
        if (chunkLength < 0 || chunkLength > compressedSliceInput.remaining()) {
            throw new OrcCorruptionException(String.format("The chunkLength (%s) must not be negative or greater than remaining size (%s)", chunkLength, compressedSliceInput.remaining()));
        }

        Slice chunk = compressedSliceInput.readSlice(chunkLength);

        if (isUncompressed) {
            current = chunk.getInput();
        }
        else {
            int uncompressedSize;
            if (compressionKind == ZLIB) {
                uncompressedSize = decompressZip(chunk);
            }
            else {
                uncompressedSize = decompressSnappy(chunk);
            }

            current = Slices.wrappedBuffer(buffer, 0, uncompressedSize).getInput();
        }
    }

    @Override
    public String toString()
    {
        return MoreObjects.toStringHelper(this)
                .add("source", source)
                .add("compressedOffset", compressedSliceInput.position())
                .add("uncompressedOffset", current == null ? null : current.position())
                .add("compression", compressionKind)
                .toString();
    }

    // This comes from the Apache Hive ORC code
    private int decompressZip(Slice in)
            throws IOException
    {
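        // ORC writes raw deflate data without zlib headers, so the Inflater is created in "nowrap" mode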
        Inflater inflater = new Inflater(true);
        try {
            inflater.setInput((byte[]) in.getBase(), (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET), in.length());
            allocateOrGrowBuffer(in.length() * EXPECTED_COMPRESSION_RATIO, false);
            int uncompressedLength = 0;
            while (true) {
                uncompressedLength += inflater.inflate(buffer, uncompressedLength, buffer.length - uncompressedLength);
                if (inflater.finished() || buffer.length >= maxBufferSize) {
                    break;
                }
                int oldBufferSize = buffer.length;
                allocateOrGrowBuffer(buffer.length * 2, true);
                if (buffer.length <= oldBufferSize) {
                    throw new IllegalStateException(String.format("Buffer failed to grow. Old size %d, current size %d", oldBufferSize, buffer.length));
                }
            }

            if (!inflater.finished()) {
                throw new OrcCorruptionException("Could not decompress all input (output buffer too small?)");
            }

            return uncompressedLength;
        }
        catch (DataFormatException e) {
            throw new OrcCorruptionException(e, "Invalid compressed stream");
        }
        finally {
            inflater.end();
        }
    }

    private int decompressSnappy(Slice in)
            throws IOException
    {
        byte[] inArray = (byte[]) in.getBase();
        int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET);
        int inLength = in.length();

        int uncompressedLength = Snappy.getUncompressedLength(inArray, inOffset);
        checkArgument(uncompressedLength <= maxBufferSize, "Snappy requires buffer (%s) larger than max size (%s)", uncompressedLength, maxBufferSize);
        allocateOrGrowBuffer(uncompressedLength, false);

        return Snappy.uncompress(inArray, inOffset, inLength, buffer, 0);
    }

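    // Ensures the shared decompression buffer is at least `size` bytes, capped at maxBufferSize;
    // existing contents are preserved only when copyExistingData is true.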
    private void allocateOrGrowBuffer(int size, boolean copyExistingData)
    {
        if (buffer == null || buffer.length < size) {
            if (copyExistingData && buffer != null) {
                buffer = Arrays.copyOfRange(buffer, 0, Math.min(size, maxBufferSize));
            }
            else {
                buffer = new byte[Math.min(size, maxBufferSize)];
            }
        }
        bufferMemoryUsage.setBytes(buffer.length);
    }
}
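
// A minimal usage sketch (not part of the original file), assuming the in-memory
// AggregatedMemoryContext implementation from com.facebook.presto.orc.memory and an
// uncompressed stream wrapped with airlift's Slices:
//
//     byte[] data = readOrcStreamBytes(); // hypothetical helper that loads the raw stream bytes
//     OrcInputStream input = new OrcInputStream(
//             "example-source",
//             Slices.wrappedBuffer(data).getInput(),
//             CompressionKind.UNCOMPRESSED,
//             8 * 1024 * 1024,
//             new AggregatedMemoryContext());
//     int firstByte = input.read(); // returns -1 once the stream is exhausted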