org.apache.orc.impl.InStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-apache Show documentation
Show all versions of hive-apache Show documentation
Shaded version of Apache Hive for Trino
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.impl;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import org.apache.orc.CompressionCodec;
import io.trino.hive.$internal.org.slf4j.Logger;
import io.trino.hive.$internal.org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.io.DiskRange;
import io.trino.hive.$internal.com.google.protobuf.CodedInputStream;
public abstract class InStream extends InputStream {
private static final Logger LOG = LoggerFactory.getLogger(InStream.class);
public static final int PROTOBUF_MESSAGE_MAX_LIMIT = 1024 << 20; // 1GB
protected final String name;
protected long length;
/**
 * Initializes the state shared by every stream implementation.
 *
 * @param name the name reported by {@link #getStreamName()}
 * @param length the length in bytes reported by {@link #getStreamLength()}
 */
public InStream(String name, long length) {
  this.length = length;
  this.name = name;
}
/**
 * Returns the name this stream was constructed with.
 *
 * @return the stream name
 */
public String getStreamName() {
  return this.name;
}
/**
 * Returns the value of the {@code length} field held by this base class.
 *
 * @return the stream length in bytes
 */
public long getStreamLength() {
  return this.length;
}
/**
 * Closes the stream.
 *
 * Redeclares {@link InputStream#close()} as abstract and without a
 * {@code throws} clause, so every concrete stream must provide its own
 * implementation and that implementation cannot throw a checked exception.
 */
@Override
public abstract void close();
/**
 * An {@link InStream} whose bytes are stored uncompressed in a list of
 * {@link DiskRange}s. Offsets handled by this class ({@code currentOffset},
 * the argument of {@link #seek(long)}) are compared directly against
 * {@link DiskRange#getOffset()} / {@link DiskRange#getEnd()}.
 */
public static class UncompressedStream extends InStream {
  private List<DiskRange> bytes;
  // The stream length lives in the inherited InStream.length field. A private
  // copy here would shadow it and leave getStreamLength() stale after reset().
  protected long currentOffset;
  // Private duplicate of the current disk range's buffer; null until the first seek.
  private ByteBuffer range;
  private int currentRange;

  /**
   * @param name the name of the stream
   * @param input the disk ranges holding the stream's bytes
   * @param length the length in bytes of the stream
   */
  public UncompressedStream(String name, List<DiskRange> input, long length) {
    super(name, length);
    reset(input, length);
  }

  /**
   * Points this stream at a new set of ranges and rewinds it to offset 0.
   *
   * @param input the disk ranges holding the stream's bytes
   * @param length the length in bytes of the stream
   */
  protected void reset(List<DiskRange> input, long length) {
    this.bytes = input;
    this.length = length;
    currentRange = 0;
    currentOffset = 0;
    range = null;
  }

  @Override
  public int read() {
    if (range == null || range.remaining() == 0) {
      if (currentOffset == length) {
        return -1;
      }
      // current buffer exhausted (or never loaded): load the range holding currentOffset
      seek(currentOffset);
    }
    currentOffset += 1;
    return 0xff & range.get();
  }

  /**
   * Reads at most {@code length} bytes, never crossing the end of the
   * current disk range, so fewer bytes than requested may be returned.
   */
  @Override
  public int read(byte[] data, int offset, int length) {
    if (range == null || range.remaining() == 0) {
      // "length" is the parameter here; this.length is the stream length
      if (currentOffset == this.length) {
        return -1;
      }
      seek(currentOffset);
    }
    int actualLength = Math.min(length, range.remaining());
    range.get(data, offset, actualLength);
    currentOffset += actualLength;
    return actualLength;
  }

  /**
   * Returns the bytes left in the current range, or, when that is exhausted,
   * the bytes left in the whole stream clamped to {@link Integer#MAX_VALUE}.
   */
  @Override
  public int available() {
    if (range != null && range.remaining() > 0) {
      return range.remaining();
    }
    // clamp instead of a bare (int) cast, which wraps for more than 2 GiB remaining
    return (int) Math.min(Integer.MAX_VALUE, length - currentOffset);
  }

  @Override
  public void close() {
    currentRange = bytes.size();
    currentOffset = length;
    // explicit de-ref of bytes[]
    bytes.clear();
  }

  @Override
  public void seek(PositionProvider index) throws IOException {
    seek(index.getNext());
  }

  /**
   * Moves the read position to {@code desired}.
   *
   * Seeking to 0 on a stream with no ranges is a no-op, and seeking to the
   * exact end of the last range is allowed.
   *
   * @param desired the target offset
   * @throws IllegalArgumentException if no range contains {@code desired}
   */
  public void seek(long desired) {
    if (desired == 0 && bytes.isEmpty()) {
      return;
    }
    int i = 0;
    for (DiskRange curRange : bytes) {
      if (curRange.getOffset() <= desired &&
          (desired - curRange.getOffset()) < curRange.getLength()) {
        positionAt(i, curRange, desired);
        return;
      }
      ++i;
    }
    // if they are seeking to the precise end, go ahead and let them go there
    int segments = bytes.size();
    if (segments != 0 && desired == bytes.get(segments - 1).getEnd()) {
      positionAt(segments - 1, bytes.get(segments - 1), desired);
      return;
    }
    throw new IllegalArgumentException("Seek in " + name + " to " +
        desired + " is outside of the data");
  }

  /** Makes {@code target} (at list index {@code index}) current and positions it at {@code desired}. */
  private void positionAt(int index, DiskRange target, long desired) {
    currentOffset = desired;
    currentRange = index;
    // duplicate so that moving the position does not disturb the range's own buffer
    range = target.getData().duplicate();
    range.position(range.position() + (int) (desired - target.getOffset()));
  }

  @Override
  public String toString() {
    return "uncompressed stream " + name + " position: " + currentOffset +
        " length: " + length + " range: " + currentRange +
        " offset: " + (range == null ? 0 : range.position()) + " limit: " + (range == null ? 0 : range.limit());
  }
}
/**
 * Allocates a new buffer of {@code size} bytes, direct when {@code isDirect}
 * is true and heap-backed otherwise.
 */
private static ByteBuffer allocateBuffer(int size, boolean isDirect) {
  // TODO: use the same pool as the ORC readers
  return isDirect ? ByteBuffer.allocateDirect(size) : ByteBuffer.allocate(size);
}
/**
 * An {@link InStream} over data stored as a sequence of chunks, each preceded
 * by a header of {@code OutStream.HEADER_SIZE} bytes. The low bit of the first
 * header byte marks a chunk stored as-is ("original"); the remaining bits of
 * the three header bytes hold the chunk length. Non-original chunks are
 * decompressed with the supplied {@link CompressionCodec}.
 */
private static class CompressedStream extends InStream {
  private final List<DiskRange> bytes;
  private final int bufferSize;
  // Decoded bytes of the current chunk; for "original" chunks this is a view
  // of the compressed data rather than a buffer this stream allocated.
  private ByteBuffer uncompressed;
  private final CompressionCodec codec;
  // Private duplicate of the current disk range's buffer.
  private ByteBuffer compressed;
  private long currentOffset;
  private int currentRange;
  // True while "uncompressed" aliases input data and so must not be written into.
  private boolean isUncompressedOriginal;

  /**
   * @param name the name of the stream
   * @param input the disk ranges holding the compressed bytes
   * @param length the length in bytes of the compressed stream
   * @param codec the codec used to decompress non-original chunks
   * @param bufferSize the maximum chunk length accepted; also the size of
   *                   the decompression buffer
   */
  public CompressedStream(String name, List<DiskRange> input, long length,
                          CompressionCodec codec, int bufferSize) {
    super(name, length);
    this.bytes = input;
    this.codec = codec;
    this.bufferSize = bufferSize;
    currentOffset = 0;
    currentRange = 0;
  }

  private void allocateForUncompressed(int size, boolean isDirect) {
    uncompressed = allocateBuffer(size, isDirect);
  }

  /**
   * Reads the next chunk header and makes that chunk's decoded bytes
   * available in {@code uncompressed}.
   *
   * @throws IllegalArgumentException if the chunk is longer than the buffer size
   * @throws IllegalStateException if the current buffer does not hold more
   *         than a header's worth of bytes
   */
  private void readHeader() throws IOException {
    if (compressed == null || compressed.remaining() <= 0) {
      seek(currentOffset);
    }
    if (compressed.remaining() > OutStream.HEADER_SIZE) {
      int b0 = compressed.get() & 0xff;
      int b1 = compressed.get() & 0xff;
      int b2 = compressed.get() & 0xff;
      boolean isOriginal = (b0 & 0x01) == 1;
      int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);

      if (chunkLength > bufferSize) {
        throw new IllegalArgumentException("Buffer size too small. size = " +
            bufferSize + " needed = " + chunkLength);
      }
      // read 3 bytes, which should be equal to OutStream.HEADER_SIZE always
      assert OutStream.HEADER_SIZE == 3 : "The Orc HEADER_SIZE must be the same in OutStream and InStream";
      currentOffset += OutStream.HEADER_SIZE;

      ByteBuffer slice = this.slice(chunkLength);

      if (isOriginal) {
        uncompressed = slice;
        isUncompressedOriginal = true;
      } else {
        if (isUncompressedOriginal) {
          // the previous buffer aliased input data; get one we are free to overwrite
          allocateForUncompressed(bufferSize, slice.isDirect());
          isUncompressedOriginal = false;
        } else if (uncompressed == null) {
          allocateForUncompressed(bufferSize, slice.isDirect());
        } else {
          uncompressed.clear();
        }
        codec.decompress(slice, uncompressed);
      }
    } else {
      throw new IllegalStateException("Can't read header at " + this);
    }
  }

  @Override
  public int read() throws IOException {
    if (!ensureUncompressed()) {
      return -1;
    }
    return 0xff & uncompressed.get();
  }

  /**
   * Reads at most {@code length} bytes from the current chunk only, so fewer
   * bytes than requested may be returned.
   */
  @Override
  public int read(byte[] data, int offset, int length) throws IOException {
    if (!ensureUncompressed()) {
      return -1;
    }
    int actualLength = Math.min(length, uncompressed.remaining());
    uncompressed.get(data, offset, actualLength);
    return actualLength;
  }

  /**
   * Loads chunks until decoded bytes are available.
   *
   * @return false if the end of the stream was reached first
   */
  private boolean ensureUncompressed() throws IOException {
    while (uncompressed == null || uncompressed.remaining() == 0) {
      if (currentOffset == this.length) {
        return false;
      }
      readHeader();
    }
    return true;
  }

  @Override
  public int available() throws IOException {
    if (!ensureUncompressed()) {
      return 0;
    }
    return uncompressed.remaining();
  }

  @Override
  public void close() {
    uncompressed = null;
    compressed = null;
    currentRange = bytes.size();
    currentOffset = length;
    bytes.clear();
  }

  /**
   * Consumes two positions from {@code index}: the offset of a chunk header
   * in the compressed data, then the number of decoded bytes to skip inside
   * that chunk.
   */
  @Override
  public void seek(PositionProvider index) throws IOException {
    seek(index.getNext());
    long uncompressedBytes = index.getNext();
    if (uncompressedBytes != 0) {
      readHeader();
      uncompressed.position(uncompressed.position() +
                            (int) uncompressedBytes);
    } else if (uncompressed != null) {
      // mark the uncompressed buffer as done
      uncompressed.position(uncompressed.limit());
    }
  }

  /**
   * Returns a contiguous buffer holding the next {@code chunkLength}
   * compressed bytes and advances past them. When the chunk lies inside the
   * current range the result is a view of it; otherwise the bytes are copied
   * out of as many following ranges as needed into a new buffer.
   *
   * @throws IOException if the ranges end before {@code chunkLength} bytes
   *         were found
   */
  private ByteBuffer slice(int chunkLength) throws IOException {
    int len = chunkLength;
    final long oldOffset = currentOffset;
    ByteBuffer slice;
    if (compressed.remaining() >= len) {
      slice = compressed.slice();
      // simple case
      slice.limit(len);
      currentOffset += len;
      compressed.position(compressed.position() + len);
      return slice;
    } else if (currentRange >= (bytes.size() - 1)) {
      // nothing has been modified yet
      throw new IOException("EOF in " + this + " while trying to read " +
          chunkLength + " bytes");
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format(
          "Crossing into next BufferChunk because compressed only has %d bytes (needs %d)",
          compressed.remaining(), len));
    }

    // we need to consolidate 2 or more buffers into 1
    // first copy out compressed buffers
    ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect());
    currentOffset += compressed.remaining();
    len -= compressed.remaining();
    copy.put(compressed);

    // Continue with the range AFTER the current one: listIterator(currentRange)
    // would hand back the range whose tail was just copied. The guard above
    // ensures currentRange + 1 is a valid index.
    ListIterator<DiskRange> iter = bytes.listIterator(currentRange + 1);

    while (len > 0 && iter.hasNext()) {
      ++currentRange;
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Read slow-path, >1 cross block reads with %s", this.toString()));
      }
      DiskRange range = iter.next();
      compressed = range.getData().duplicate();
      if (compressed.remaining() >= len) {
        slice = compressed.slice();
        slice.limit(len);
        copy.put(slice);
        currentOffset += len;
        compressed.position(compressed.position() + len);
        // copy is now full (position == limit); flip so callers can read it
        copy.flip();
        return copy;
      }
      currentOffset += compressed.remaining();
      len -= compressed.remaining();
      copy.put(compressed);
    }

    // restore offsets for exception clarity
    seek(oldOffset);
    throw new IOException("EOF in " + this + " while trying to read " +
        chunkLength + " bytes");
  }

  /**
   * Moves the compressed read position to {@code desired}. Seeking to 0 with
   * no ranges is a no-op, and seeking to the exact end of the last range is
   * allowed.
   *
   * @throws IOException if no range contains {@code desired}
   */
  private void seek(long desired) throws IOException {
    if (desired == 0 && bytes.isEmpty()) {
      return;
    }
    int i = 0;
    for (DiskRange range : bytes) {
      if (range.getOffset() <= desired && desired < range.getEnd()) {
        currentRange = i;
        compressed = range.getData().duplicate();
        int pos = compressed.position();
        pos += (int)(desired - range.getOffset());
        compressed.position(pos);
        currentOffset = desired;
        return;
      }
      ++i;
    }
    // if they are seeking to the precise end, go ahead and let them go there
    int segments = bytes.size();
    if (segments != 0 && desired == bytes.get(segments - 1).getEnd()) {
      DiskRange range = bytes.get(segments - 1);
      currentRange = segments - 1;
      compressed = range.getData().duplicate();
      compressed.position(compressed.limit());
      currentOffset = desired;
      return;
    }
    throw new IOException("Seek outside of data in " + this + " to " + desired);
  }

  /** Describes every range for {@link #toString()}; the second number printed is the range's length. */
  private String rangeString() {
    StringBuilder builder = new StringBuilder();
    int i = 0;
    for (DiskRange range : bytes) {
      if (i != 0) {
        builder.append("; ");
      }
      builder.append(" range " + i + " = " + range.getOffset()
          + " to " + (range.getEnd() - range.getOffset()));
      ++i;
    }
    return builder.toString();
  }

  @Override
  public String toString() {
    return "compressed stream " + name + " position: " + currentOffset +
        " length: " + length + " range: " + currentRange +
        " offset: " + (compressed == null ? 0 : compressed.position()) + " limit: " + (compressed == null ? 0 : compressed.limit()) +
        rangeString() +
        (uncompressed == null ? "" :
            " uncompressed: " + uncompressed.position() + " to " +
                uncompressed.limit());
  }
}
/**
 * Repositions this stream using values taken from {@code index} via
 * {@code getNext()}. How many values are consumed is up to the
 * implementation: the uncompressed stream in this file takes one (a byte
 * offset), the compressed stream takes two (a chunk offset followed by a
 * count of decoded bytes to skip).
 *
 * @param index the source of the positions to seek to
 * @throws IOException if the stream cannot be positioned as requested
 */
public abstract void seek(PositionProvider index) throws IOException;
/**
 * Create an input stream from an array of buffers.
 * @param streamName the name of the stream
 * @param buffers the buffers holding the bytes of the stream
 * @param offsets an array of offsets (at least as long as {@code buffers})
 *                giving, for each buffer, the offset of its first byte
 * @param length the length in bytes of the stream
 * @param codec the compression codec, or null if the data is uncompressed
 * @param bufferSize the compression buffer size
 * @return an input stream
 * @throws IOException declared by the list-based overload this delegates to
 * @deprecated wrap the buffers in disk ranges and call
 *             {@link #create(String, List, long, CompressionCodec, int)}
 */
//@VisibleForTesting
@Deprecated
public static InStream create(String streamName,
                              ByteBuffer[] buffers,
                              long[] offsets,
                              long length,
                              CompressionCodec codec,
                              int bufferSize) throws IOException {
  // pair each buffer with its offset; typed so only disk ranges can be added
  List<DiskRange> input = new ArrayList<>(buffers.length);
  for (int i = 0; i < buffers.length; ++i) {
    input.add(new BufferChunk(buffers[i], offsets[i]));
  }
  return create(streamName, input, length, codec, bufferSize);
}
/**
 * Builds the stream implementation matching the given codec: an
 * uncompressed stream when {@code codec} is null, a compressed one otherwise.
 * @param name the name of the stream
 * @param input the list of ranges of bytes for the stream; from disk or cache
 * @param length the length in bytes of the stream
 * @param codec the compression codec, or null for uncompressed data
 * @param bufferSize the compression buffer size (unused when {@code codec} is null)
 * @return an input stream
 * @throws IOException
 */
public static InStream create(String name,
                              List input,
                              long length,
                              CompressionCodec codec,
                              int bufferSize) throws IOException {
  return codec == null
      ? new UncompressedStream(name, input, length)
      : new CompressedStream(name, input, length, codec, bufferSize);
}
/**
 * Wraps a freshly created {@link InStream} in a protobuf
 * {@link CodedInputStream} whose size limit is set to
 * {@link #PROTOBUF_MESSAGE_MAX_LIMIT}.
 *
 * @param name the name of the stream
 * @param input the list of ranges of bytes for the stream; from disk or cache
 * @param length the length in bytes of the stream
 * @param codec the compression codec
 * @param bufferSize the compression buffer size
 * @return coded input stream
 * @throws IOException
 */
public static CodedInputStream createCodedInputStream(
    String name,
    List input,
    long length,
    CompressionCodec codec,
    int bufferSize) throws IOException {
  CodedInputStream coded =
      CodedInputStream.newInstance(create(name, input, length, codec, bufferSize));
  coded.setSizeLimit(PROTOBUF_MESSAGE_MAX_LIMIT);
  return coded;
}
}