All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.parquet.bytes.BytesInput Maven / Gradle / Ivy

There is a newer version: 1.15.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.bytes;

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A source of bytes capable of writing itself to an output.
 * A BytesInput should be consumed right away.
 * It is not a container.
 * For example if it is referring to a stream,
 * subsequent BytesInput reads from the stream will be incorrect
 * if the previous has not been consumed.
 */
public abstract class BytesInput {
  private static final Logger LOG = LoggerFactory.getLogger(BytesInput.class);
  private static final EmptyBytesInput EMPTY_BYTES_INPUT = new EmptyBytesInput();

  /**
   * logically concatenate the provided inputs
   *
   * @param inputs the inputs to concatenate
   * @return a concatenated input
   */
  public static BytesInput concat(BytesInput... inputs) {
    return new SequenceBytesIn(Arrays.asList(inputs));
  }

  /**
   * logically concatenate the provided inputs
   *
   * @param inputs the inputs to concatenate
   * @return a concatenated input
   */
  public static BytesInput concat(List inputs) {
    return new SequenceBytesIn(inputs);
  }

  /**
   * @param in    an input stream
   * @param bytes number of bytes to read
   * @return a BytesInput that will read that number of bytes from the stream
   */
  public static BytesInput from(InputStream in, int bytes) {
    return new StreamBytesInput(in, bytes);
  }

  /**
   * @param buffer
   * @param length number of bytes to read
   * @return a BytesInput that will read the given bytes from the ByteBuffer
   * @deprecated Will be removed in 2.0.0
   */
  @Deprecated
  public static BytesInput from(ByteBuffer buffer, int offset, int length) {
    ByteBuffer tmp = buffer.duplicate();
    tmp.position(offset);
    ByteBuffer slice = tmp.slice();
    slice.limit(length);
    return new ByteBufferBytesInput(slice);
  }

  /**
   * @param buffers an array of byte buffers
   * @return a BytesInput that will read the given bytes from the ByteBuffers
   */
  public static BytesInput from(ByteBuffer... buffers) {
    if (buffers.length == 1) {
      return new ByteBufferBytesInput(buffers[0]);
    }
    return new BufferListBytesInput(Arrays.asList(buffers));
  }

  /**
   * @param buffers a list of byte buffers
   * @return a BytesInput that will read the given bytes from the ByteBuffers
   */
  public static BytesInput from(List buffers) {
    if (buffers.size() == 1) {
      return new ByteBufferBytesInput(buffers.get(0));
    }
    return new BufferListBytesInput(buffers);
  }

  /**
   * @param in a byte array
   * @return a Bytes input that will write the given bytes
   */
  public static BytesInput from(byte[] in) {
    LOG.debug("BytesInput from array of {} bytes", in.length);
    return new ByteArrayBytesInput(in, 0, in.length);
  }

  public static BytesInput from(byte[] in, int offset, int length) {
    LOG.debug("BytesInput from array of {} bytes", length);
    return new ByteArrayBytesInput(in, offset, length);
  }

  /**
   * @param intValue the int to write
   * @return a BytesInput that will write 4 bytes in little endian
   */
  public static BytesInput fromInt(int intValue) {
    return new IntBytesInput(intValue);
  }

  /**
   * @param intValue the int to write
   * @return a BytesInput that will write var int
   */
  public static BytesInput fromUnsignedVarInt(int intValue) {
    return new UnsignedVarIntBytesInput(intValue);
  }

  /**
   * @param intValue the int to write
   * @return a ByteInput that contains the int value as a variable-length zig-zag encoded int
   */
  public static BytesInput fromZigZagVarInt(int intValue) {
    int zigZag = (intValue << 1) ^ (intValue >> 31);
    return new UnsignedVarIntBytesInput(zigZag);
  }

  /**
   * @param longValue the long to write
   * @return a BytesInput that will write var long
   */
  public static BytesInput fromUnsignedVarLong(long longValue) {
    return new UnsignedVarLongBytesInput(longValue);
  }

  /**
   * @param longValue the long to write
   * @return a ByteInput that contains the long value as a variable-length zig-zag encoded long
   */
  public static BytesInput fromZigZagVarLong(long longValue) {
    long zigZag = (longValue << 1) ^ (longValue >> 63);
    return new UnsignedVarLongBytesInput(zigZag);
  }

  /**
   * @param arrayOut a capacity byte array output stream to wrap into a BytesInput
   * @return a BytesInput that will write the content of the buffer
   */
  public static BytesInput from(CapacityByteArrayOutputStream arrayOut) {
    return new CapacityBAOSBytesInput(arrayOut);
  }

  /**
   * @param baos - stream to wrap into a BytesInput
   * @return a BytesInput that will write the content of the buffer
   */
  public static BytesInput from(ByteArrayOutputStream baos) {
    return new BAOSBytesInput(baos);
  }

  /**
   * @return an empty bytes input
   */
  public static BytesInput empty() {
    return EMPTY_BYTES_INPUT;
  }

  /**
   * copies the input into a new byte array
   *
   * @param bytesInput a BytesInput
   * @return a copy of the BytesInput
   * @throws IOException if there is an exception when reading bytes from the BytesInput
   * @deprecated Use {@link #copy(ByteBufferAllocator, Consumer)} instead
   */
  @Deprecated
  public static BytesInput copy(BytesInput bytesInput) throws IOException {
    return from(bytesInput.toByteArray());
  }

  /**
   * writes the bytes into a stream
   *
   * @param out an output stream
   * @throws IOException if there is an exception writing
   */
  public abstract void writeAllTo(OutputStream out) throws IOException;

  /**
   * For internal use only. It is expected that the buffer is large enough to fit the content of this {@link BytesInput}
   * object.
   */
  abstract void writeInto(ByteBuffer buffer);

  /**
   * @return a new byte array materializing the contents of this input
   * @throws IOException if there is an exception reading
   * @deprecated Use {@link #toByteBuffer(ByteBufferAllocator, Consumer)}
   */
  @Deprecated
  public byte[] toByteArray() throws IOException {
    long size = size();
    if (size > Integer.MAX_VALUE) {
      throw new IOException("Page size, " + size + ", is larger than allowed " + Integer.MAX_VALUE + "."
          + " Usually caused by a Parquet writer writing too big column chunks on encountering highly skewed dataset."
          + " Please set page.size.row.check.max to a lower value on the writer, default value is 10000."
          + " You can try setting it to "
          + (10000 / (size / Integer.MAX_VALUE)) + " or lower.");
    }
    BAOS baos = new BAOS((int) size());
    this.writeAllTo(baos);
    LOG.debug("converted {} to byteArray of {} bytes", size(), baos.size());
    return baos.getBuf();
  }

  /**
   * @return a new ByteBuffer materializing the contents of this input
   * @throws IOException if there is an exception reading
   * @deprecated Use {@link #toByteBuffer(ByteBufferAllocator, Consumer)}
   */
  @Deprecated
  public ByteBuffer toByteBuffer() throws IOException {
    return ByteBuffer.wrap(toByteArray());
  }

  /**
   * Copies the content of this {@link BytesInput} object to a newly created {@link ByteBuffer} and returns it wrapped
   * in a {@link BytesInput} object.
   *
   * The data content shall be able to be fit in a {@link ByteBuffer} object! (In case of the size of
   * this {@link BytesInput} object cannot fit in an {@code int}, an {@link ArithmeticException} will be thrown. The
   * {@code allocator} might throw an {@link OutOfMemoryError} if it is unable to allocate the required
   * {@link ByteBuffer}.)
   *
   * @param allocator the allocator to be used for creating the new {@link ByteBuffer} object
   * @param callback  the callback called with the newly created {@link ByteBuffer} object; to be used for make it
   *                  released at the proper time
   * @return the newly created {@link BytesInput} object wrapping the copied content of the specified one
   */
  public BytesInput copy(ByteBufferAllocator allocator, Consumer callback) {
    ByteBuffer buf = allocator.allocate(Math.toIntExact(size()));
    callback.accept(buf);
    writeInto(buf);
    buf.flip();
    return BytesInput.from(buf);
  }

  /**
   * Similar to {@link #copy(ByteBufferAllocator, Consumer)} where the allocator and the callback are in the specified
   * {@link ByteBufferReleaser}.
   */
  public BytesInput copy(ByteBufferReleaser releaser) {
    return copy(releaser.allocator, releaser::releaseLater);
  }

  /**
   * Returns a {@link ByteBuffer} object referencing the data behind this {@link BytesInput} object. It may create a new
   * {@link ByteBuffer} object if this {@link BytesInput} is not backed by a single {@link ByteBuffer}. In the latter
   * case the specified {@link ByteBufferAllocator} object will be used. In case of allocation the specified callback
   * will be invoked so the release of the newly allocated {@link ByteBuffer} object can be released at a proper time.
   *
   * The data content shall be able to be fit in a {@link ByteBuffer} object! (In case of the size of
   * this {@link BytesInput} object cannot fit in an {@code int}, an {@link ArithmeticException} will be thrown. The
   * {@code allocator} might throw an {@link OutOfMemoryError} if it is unable to allocate the required
   * {@link ByteBuffer}.)
   *
   * @param allocator the {@link ByteBufferAllocator} to be used for potentially allocating a new {@link ByteBuffer}
   *                  object
   * @param callback  the callback to be called with the new {@link ByteBuffer} object potentially allocated
   * @return the {@link ByteBuffer} object with the data content of this {@link BytesInput} object. (Might be a copy of
   * the content or directly referencing the same memory as this {@link BytesInput} object.)
   */
  public ByteBuffer toByteBuffer(ByteBufferAllocator allocator, Consumer callback) {
    ByteBuffer buf = getInternalByteBuffer();
    // The internal buffer should be direct iff the allocator is direct as well but let's be sure
    if (buf == null || buf.isDirect() != allocator.isDirect()) {
      buf = allocator.allocate(Math.toIntExact(size()));
      callback.accept(buf);
      writeInto(buf);
      buf.flip();
    }
    return buf;
  }

  /**
   * Similar to {@link #toByteBuffer(ByteBufferAllocator, Consumer)} where the allocator and the callback are in the
   * specified {@link ByteBufferReleaser}.
   */
  public ByteBuffer toByteBuffer(ByteBufferReleaser releaser) {
    return toByteBuffer(releaser.allocator, releaser::releaseLater);
  }

  /**
   * For internal use only.
   * 

* Returns a {@link ByteBuffer} object referencing to the internal data of this {@link BytesInput} without copying if * applicable. If it is not possible (because there are multiple {@link ByteBuffer}s internally or cannot be * referenced as a {@link ByteBuffer}), {@code null} value will be returned. * * @return the internal data of this {@link BytesInput} or {@code null} */ ByteBuffer getInternalByteBuffer() { return null; } /** * @return a new InputStream materializing the contents of this input * @throws IOException if there is an exception reading */ public ByteBufferInputStream toInputStream() throws IOException { return ByteBufferInputStream.wrap(toByteBuffer()); } /** * @return the size in bytes that would be written */ public abstract long size(); private static final class BAOS extends ByteArrayOutputStream { private BAOS(int size) { super(size); } public byte[] getBuf() { return this.buf; } } private static class StreamBytesInput extends BytesInput { private static final Logger LOG = LoggerFactory.getLogger(BytesInput.StreamBytesInput.class); private final InputStream in; private final int byteCount; private StreamBytesInput(InputStream in, int byteCount) { super(); this.in = in; this.byteCount = byteCount; } @Override public void writeAllTo(OutputStream out) throws IOException { LOG.debug("write All {} bytes", byteCount); // TODO: more efficient out.write(this.toByteArray()); } @Override void writeInto(ByteBuffer buffer) { try { // Needs a duplicate buffer to set the correct limit (we do not want to over-read the stream) ByteBuffer workBuf = buffer.duplicate(); int pos = buffer.position(); workBuf.limit(pos + byteCount); Channels.newChannel(in).read(workBuf); buffer.position(pos + byteCount); } catch (IOException e) { new RuntimeException("Exception occurred during reading input stream", e); } } public byte[] toByteArray() throws IOException { LOG.debug("read all {} bytes", byteCount); byte[] buf = new byte[byteCount]; new 
DataInputStream(in).readFully(buf); return buf; } @Override public long size() { return byteCount; } } private static class SequenceBytesIn extends BytesInput { private static final Logger LOG = LoggerFactory.getLogger(BytesInput.SequenceBytesIn.class); private final List inputs; private final long size; private SequenceBytesIn(List inputs) { this.inputs = inputs; long total = 0; for (BytesInput input : inputs) { total += input.size(); } this.size = total; } @SuppressWarnings("unused") @Override public void writeAllTo(OutputStream out) throws IOException { for (BytesInput input : inputs) { LOG.debug("write {} bytes to out", input.size()); if (input instanceof SequenceBytesIn) LOG.debug("{"); input.writeAllTo(out); if (input instanceof SequenceBytesIn) LOG.debug("}"); } } @Override void writeInto(ByteBuffer buffer) { for (BytesInput input : inputs) { input.writeInto(buffer); } } @Override ByteBuffer getInternalByteBuffer() { return inputs.size() == 1 ? inputs.get(0).getInternalByteBuffer() : null; } @Override public long size() { return size; } } private static class IntBytesInput extends BytesInput { private final int intValue; public IntBytesInput(int intValue) { this.intValue = intValue; } @Override public void writeAllTo(OutputStream out) throws IOException { BytesUtils.writeIntLittleEndian(out, intValue); } @Override void writeInto(ByteBuffer buffer) { buffer.order(ByteOrder.LITTLE_ENDIAN).putInt(intValue); } public ByteBuffer toByteBuffer() { ByteBuffer buf = ByteBuffer.allocate(4); writeInto(buf); buf.flip(); return buf; } @Override public long size() { return 4; } } private static class UnsignedVarIntBytesInput extends BytesInput { private final int intValue; public UnsignedVarIntBytesInput(int intValue) { this.intValue = intValue; } @Override public void writeAllTo(OutputStream out) throws IOException { BytesUtils.writeUnsignedVarInt(intValue, out); } @Override void writeInto(ByteBuffer buffer) { try { BytesUtils.writeUnsignedVarInt(intValue, buffer); } 
catch (IOException e) { // It does not actually throw an I/O exception, but we cannot remove throws for compatibility throw new RuntimeException(e); } } public ByteBuffer toByteBuffer() { ByteBuffer ret = ByteBuffer.allocate((int) size()); writeInto(ret); ret.flip(); return ret; } @Override public long size() { int s = (38 - Integer.numberOfLeadingZeros(intValue)) / 7; return s == 0 ? 1 : s; } } private static class UnsignedVarLongBytesInput extends BytesInput { private final long longValue; public UnsignedVarLongBytesInput(long longValue) { this.longValue = longValue; } @Override public void writeAllTo(OutputStream out) throws IOException { BytesUtils.writeUnsignedVarLong(longValue, out); } @Override void writeInto(ByteBuffer buffer) { BytesUtils.writeUnsignedVarLong(longValue, buffer); } @Override public long size() { int s = (70 - Long.numberOfLeadingZeros(longValue)) / 7; return s == 0 ? 1 : s; } } private static class EmptyBytesInput extends BytesInput { @Override public void writeAllTo(OutputStream out) throws IOException {} @Override void writeInto(ByteBuffer buffer) { // no-op } @Override public long size() { return 0; } public ByteBuffer toByteBuffer() throws IOException { return ByteBuffer.allocate(0); } } private static class CapacityBAOSBytesInput extends BytesInput { private final CapacityByteArrayOutputStream arrayOut; private CapacityBAOSBytesInput(CapacityByteArrayOutputStream arrayOut) { this.arrayOut = arrayOut; } @Override public void writeAllTo(OutputStream out) throws IOException { arrayOut.writeTo(out); } @Override void writeInto(ByteBuffer buffer) { arrayOut.writeInto(buffer); } @Override ByteBuffer getInternalByteBuffer() { return arrayOut.getInternalByteBuffer(); } @Override public long size() { return arrayOut.size(); } } private static class BAOSBytesInput extends BytesInput { private final ByteArrayOutputStream arrayOut; private BAOSBytesInput(ByteArrayOutputStream arrayOut) { this.arrayOut = arrayOut; } @Override public void 
writeAllTo(OutputStream out) throws IOException { arrayOut.writeTo(out); } @Override void writeInto(ByteBuffer buffer) { buffer.put(arrayOut.toByteArray()); } @Override public long size() { return arrayOut.size(); } } private static class ByteArrayBytesInput extends BytesInput { private final byte[] in; private final int offset; private final int length; private ByteArrayBytesInput(byte[] in, int offset, int length) { this.in = in; this.offset = offset; this.length = length; } @Override public void writeAllTo(OutputStream out) throws IOException { out.write(in, offset, length); } @Override void writeInto(ByteBuffer buffer) { buffer.put(in, offset, length); } public ByteBuffer toByteBuffer() throws IOException { return java.nio.ByteBuffer.wrap(in, offset, length); } @Override public long size() { return length; } } private static class BufferListBytesInput extends BytesInput { private final List buffers; private final long length; public BufferListBytesInput(List buffers) { this.buffers = buffers; long totalLen = 0; for (ByteBuffer buffer : buffers) { totalLen += buffer.remaining(); } this.length = totalLen; } @Override public void writeAllTo(OutputStream out) throws IOException { WritableByteChannel channel = Channels.newChannel(out); for (ByteBuffer buffer : buffers) { channel.write(buffer.duplicate()); } } @Override void writeInto(ByteBuffer target) { for (ByteBuffer buffer : buffers) { target.put(buffer.duplicate()); } } @Override public ByteBufferInputStream toInputStream() { return ByteBufferInputStream.wrap(buffers); } @Override public long size() { return length; } } private static class ByteBufferBytesInput extends BytesInput { private final ByteBuffer buffer; private ByteBufferBytesInput(ByteBuffer buffer) { this.buffer = buffer; } @Override public void writeAllTo(OutputStream out) throws IOException { Channels.newChannel(out).write(buffer.duplicate()); } @Override void writeInto(ByteBuffer target) { target.put(buffer.duplicate()); } @Override ByteBuffer 
getInternalByteBuffer() { return buffer.slice(); } @Override public ByteBufferInputStream toInputStream() { return ByteBufferInputStream.wrap(buffer); } @Override public long size() { return buffer.remaining(); } @Override public ByteBuffer toByteBuffer() throws IOException { return buffer.slice(); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy