com.facebook.hive.orc.OutStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-dwrf Show documentation
Show all versions of hive-dwrf Show documentation
DWRF file format for Hive
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.hive.orc;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;
import com.facebook.hive.orc.compression.CompressionCodec;
import org.apache.hadoop.hive.serde2.ReaderWriterProfiler;
/**
 * An output stream that buffers written bytes in chunks of {@code bufferSize}
 * and hands each finished chunk to an {@link OutputReceiver}.
 *
 * <p>When a {@link CompressionCodec} is supplied, every chunk is passed
 * through the codec and prefixed with a {@link #HEADER_SIZE}-byte header
 * (see {@link #writeHeader}); when the codec is {@code null} the raw buffer
 * is sent to the receiver without a header.
 *
 * <p>NOTE(review): the {@code suppress} flag used below is not declared in
 * this class; it is presumably inherited from PositionedOutputStream - confirm.
 */
class OutStream extends PositionedOutputStream {

  /** Destination for finished buffers. */
  interface OutputReceiver {
    /**
     * Output the given buffer to the final destination
     * @param buffer the buffer to output
     * @throws IOException if the buffer cannot be written
     */
    void output(ByteBuffer buffer) throws IOException;
  }

  /** Number of bytes in the per-chunk header written by {@link #writeHeader}. */
  static final int HEADER_SIZE = 3;

  // If the OutStream is flushing the last compressed ByteBuffer, we don't need to reallocate
  // a new one, we can just share the same one over and over between streams because it gets
  // cleared right away. It is a mapping from the size of the buffer to the buffer.
  // NOTE(review): this is a plain static HashMap accessed without synchronization from the
  // constructor and spill(); confirm that streams are never created or flushed concurrently.
  private static final Map<Integer, ByteBuffer> COMPRESSED_FOR_REUSE =
      new HashMap<Integer, ByteBuffer>();

  /** Name returned by {@link #toString()}. */
  private final String name;

  /** Receives every finished buffer. */
  private final OutputReceiver receiver;

  /**
   * Stores the uncompressed bytes that have been serialized, but not
   * compressed yet. When this fills, we compress the entire buffer.
   */
  private ByteBuffer current = null;

  /**
   * Stores the compressed bytes until we have a full buffer and then outputs
   * them to the receiver. If no compression is being done, this (and overflow)
   * will always be null and the current buffer will be sent directly to the
   * receiver.
   */
  private ByteBuffer compressed = null;

  /**
   * Since the compressed buffer may start with contents from previous
   * compression blocks, we allocate an overflow buffer so that the
   * output of the codec can be split between the two buffers. After the
   * compressed buffer is sent to the receiver, the overflow buffer becomes
   * the new compressed buffer.
   */
  private ByteBuffer overflow = null;

  /** Payload capacity of each buffer, excluding the header. */
  private final int bufferSize;

  /** Codec used to compress chunks, or {@code null} for no compression. */
  private final CompressionCodec codec;

  /** Bytes (headers included) produced for completed chunks since the last {@link #clear()}. */
  private long compressedBytes = 0;

  /** Bytes written into the current chunk that have not been spilled yet. */
  private long uncompressedBytes = 0;

  /** Tracks the capacity of the buffers this stream has allocated. */
  private final MemoryEstimate memoryEstimate;

  /**
   * Creates a stream and makes sure a shared reusable output buffer of
   * {@code bufferSize + HEADER_SIZE} bytes exists.
   *
   * @param name the name reported by {@link #toString()}
   * @param bufferSize the payload size of each chunk in bytes
   * @param codec the codec to compress with, or {@code null} for none
   * @param receiver the destination of finished buffers
   * @param memoryEstimate the estimate updated as buffers are allocated and released
   * @throws IOException declared for callers; nothing in this constructor body throws it
   */
  OutStream(String name,
            int bufferSize,
            CompressionCodec codec,
            OutputReceiver receiver,
            MemoryEstimate memoryEstimate) throws IOException {
    this.name = name;
    this.bufferSize = bufferSize;
    this.codec = codec;
    this.receiver = receiver;
    this.suppress = false;
    this.memoryEstimate = memoryEstimate;
    if (!COMPRESSED_FOR_REUSE.containsKey(bufferSize + HEADER_SIZE)) {
      COMPRESSED_FOR_REUSE.put(bufferSize + HEADER_SIZE,
          ByteBuffer.allocate(bufferSize + HEADER_SIZE));
    }
  }

  /**
   * Resets the byte counters, drops every buffer still held (decrementing the
   * memory estimate by each buffer's capacity) and clears the suppress flag.
   *
   * @throws IOException declared for callers; propagated from the memory estimate if it throws
   */
  public void clear() throws IOException {
    uncompressedBytes = 0;
    compressedBytes = 0;
    if (overflow != null) {
      memoryEstimate.decrementTotalMemory(overflow.capacity());
      overflow = null;
    }
    if (current != null) {
      memoryEstimate.decrementTotalMemory(current.capacity());
      current = null;
    }
    if (compressed != null) {
      memoryEstimate.decrementTotalMemory(compressed.capacity());
      compressed = null;
    }
    suppress = false;
  }

  /**
   * Write the length of the compressed bytes. Life is much easier if the
   * header is constant length, so just use 3 bytes. Considering most of the
   * codecs want between 32k (snappy) and 256k (lzo, zlib), 3 bytes should
   * be plenty. We also use the low bit for whether it is the original or
   * compressed bytes.
   *
   * <p>The three bytes hold {@code (val << 1) | original} in little-endian
   * order, so only the low 23 bits of {@code val} are representable; larger
   * values are silently truncated.
   *
   * @param buffer the buffer to write the header to
   * @param position the position in the buffer to write at
   * @param val the size in the file
   * @param original is it uncompressed
   */
  private static void writeHeader(ByteBuffer buffer,
                                  int position,
                                  int val,
                                  boolean original) {
    buffer.put(position, (byte) ((val << 1) + (original ? 1 : 0)));
    buffer.put(position + 1, (byte) (val >> 7));
    buffer.put(position + 2, (byte) (val >> 15));
  }

  /**
   * Allocates a fresh input buffer into {@code current} and adds its capacity
   * to the memory estimate. When compressing, room for a header is reserved at
   * the front and pre-filled as an "original" header of {@code bufferSize}.
   */
  private void getNewInputBuffer() throws IOException {
    if (codec == null) {
      current = ByteBuffer.allocate(bufferSize);
    } else {
      current = ByteBuffer.allocate(bufferSize + HEADER_SIZE);
      writeHeader(current, 0, bufferSize, true);
      current.position(HEADER_SIZE);
    }
    memoryEstimate.incrementTotalMemory(current.capacity());
  }

  /**
   * Allocate a new output buffer of {@code bufferSize + HEADER_SIZE} bytes.
   * The caller is responsible for updating the memory estimate.
   */
  private ByteBuffer getNewOutputBuffer() throws IOException {
    return ByteBuffer.allocate(bufferSize + HEADER_SIZE);
  }

  /**
   * Prepares {@code current} for reading: the limit becomes the write
   * position and the position moves to the first payload byte (after the
   * header when compressing).
   */
  private void flip() throws IOException {
    current.limit(current.position());
    current.position(codec == null ? 0 : HEADER_SIZE);
  }

  /**
   * Writes a single byte, spilling the current buffer first if it is full.
   *
   * @param i the byte to write (only the low 8 bits are stored)
   */
  @Override
  public void write(int i) throws IOException {
    if (current == null) {
      getNewInputBuffer();
    }
    if (current.remaining() < 1) {
      spill(false);
    }
    uncompressedBytes += 1;
    current.put((byte) i);
  }

  /**
   * Writes {@code length} bytes from {@code bytes} starting at {@code offset},
   * spilling the current buffer each time it fills up.
   */
  @Override
  public void write(byte[] bytes, int offset, int length) throws IOException {
    if (current == null) {
      getNewInputBuffer();
    }
    int remaining = Math.min(current.remaining(), length);
    current.put(bytes, offset, remaining);
    uncompressedBytes += remaining;
    length -= remaining;
    while (length != 0) {
      spill(false);
      offset += remaining;
      remaining = Math.min(current.remaining(), length);
      current.put(bytes, offset, remaining);
      uncompressedBytes += remaining;
      length -= remaining;
    }
  }

  /**
   * Pushes the contents of {@code current} towards the receiver.
   *
   * <p>Without a codec the buffer is output as is and replaced. With a codec
   * the buffer is compressed into {@code compressed} (and {@code overflow});
   * if the codec reports that compression did not help, any pending
   * compressed data is output first and the original buffer is output with
   * an "original" header instead.
   *
   * @param reuseBuffer if true and no compressed buffer is held yet, the
   *        shared buffer from {@code COMPRESSED_FOR_REUSE} is used as the
   *        compression target instead of a newly allocated one
   */
  private void spill(boolean reuseBuffer) throws java.io.IOException {
    ReaderWriterProfiler.start(ReaderWriterProfiler.Counter.COMPRESSION_TIME);
    // try/finally keeps the profiler's start/end calls paired even when the
    // receiver or the codec throws.
    try {
      // if there isn't anything in the current buffer, don't spill
      if (current == null || current.position() == (codec == null ? 0 : HEADER_SIZE)) {
        return;
      }
      flip();
      if (codec == null) {
        receiver.output(current);
        getNewInputBuffer();
      } else {
        if (compressed == null) {
          if (reuseBuffer) {
            compressed = COMPRESSED_FOR_REUSE.get(bufferSize + HEADER_SIZE);
            compressed.clear();
          } else {
            compressed = getNewOutputBuffer();
          }
          memoryEstimate.incrementTotalMemory(compressed.capacity());
        } else if (overflow == null) {
          overflow = getNewOutputBuffer();
          memoryEstimate.incrementTotalMemory(overflow.capacity());
        }
        // remember where this chunk's header goes and skip over it for now
        int sizePosn = compressed.position();
        compressed.position(compressed.position() + HEADER_SIZE);
        if (codec.compress(current, compressed, overflow)) {
          uncompressedBytes = 0;
          // move position back to after the header
          current.position(HEADER_SIZE);
          current.limit(current.capacity());
          // find the total bytes in the chunk
          int totalBytes = compressed.position() - sizePosn - HEADER_SIZE;
          if (overflow != null) {
            totalBytes += overflow.position();
          }
          compressedBytes += totalBytes + HEADER_SIZE;
          writeHeader(compressed, sizePosn, totalBytes, false);
          // if we have less than the next header left, spill it.
          if (compressed.remaining() < HEADER_SIZE) {
            compressed.flip();
            receiver.output(compressed);
            compressed = overflow;
            overflow = null;
          }
        } else {
          compressedBytes += uncompressedBytes + HEADER_SIZE;
          uncompressedBytes = 0;
          // we are using the original, but need to spill the current
          // compressed buffer first. So back up to where we started,
          // flip it and add it to done.
          if (sizePosn != 0) {
            compressed.position(sizePosn);
            compressed.flip();
            receiver.output(compressed);
            compressed = null;
            // if we have an overflow, clear it and make it the new compress
            // buffer
            if (overflow != null) {
              overflow.clear();
              compressed = overflow;
              overflow = null;
            }
          } else {
            compressed.clear();
            if (overflow != null) {
              overflow.clear();
            }
          }
          // now add the current buffer into the done list and get a new one.
          current.position(0);
          // update the header with the current length
          writeHeader(current, 0, current.limit() - HEADER_SIZE, true);
          receiver.output(current);
          getNewInputBuffer();
        }
      }
    } finally {
      ReaderWriterProfiler.end(ReaderWriterProfiler.Counter.COMPRESSION_TIME);
    }
  }

  /**
   * Records the current position of this stream: the uncompressed byte count,
   * preceded by the compressed byte count when a codec is in use.
   *
   * @param recorder the recorder that receives the position values
   */
  @Override
  void getPosition(PositionRecorder recorder) throws IOException {
    if (codec == null) {
      recorder.addPosition(uncompressedBytes);
    } else {
      recorder.addPosition(compressedBytes);
      recorder.addPosition(uncompressedBytes);
    }
  }

  /** Flushes without using the shared reusable buffer; same as {@code flush(false)}. */
  @Override
  public void flush() throws IOException {
    flush(false);
  }

  /**
   * Spills any pending data, outputs a partially filled compressed buffer to
   * the receiver and then resets this stream via {@link #clear()}.
   *
   * @param reuseBuffer If this is set to true, the compressed data will be flushed to a reusable
   *                    buffer, make sure to flush the buffers to disk before flushing any other
   *                    stream.
   */
  @Override
  public void flush(boolean reuseBuffer) throws IOException {
    spill(reuseBuffer);
    if (compressed != null && compressed.position() != 0) {
      compressed.flip();
      receiver.output(compressed);
      // NOTE(review): compressed is nulled before clear(), so clear() does not
      // decrement the memory estimate for it - confirm this is intended.
      compressed = null;
    }
    clear();
  }

  /** Returns the name this stream was created with. */
  @Override
  public String toString() {
    return name;
  }

  /**
   * Returns the combined capacity of the buffers currently held
   * ({@code current}, {@code compressed} and {@code overflow}).
   */
  @Override
  public long getBufferSize() {
    long result = 0;
    if (current != null) {
      result += current.capacity();
    }
    if (compressed != null) {
      result += compressed.capacity();
    }
    if (overflow != null) {
      result += overflow.capacity();
    }
    return result;
  }

  /**
   * Returns the state of suppress flag
   * @return value of suppress flag
   */
  public boolean isSuppressed() {
    return suppress;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy