/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.comet.parquet;
import java.nio.ByteBuffer;
import org.apache.comet.NativeBase;
public final class Native extends NativeBase {
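/** Convenience overload of {@link Native#readBatch(long, int, int)} with no null padding. */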
public static int[] readBatch(long handle, int batchSize) {
return readBatch(handle, batchSize, 0);
}
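/** Convenience overload of {@link Native#skipBatch(long, int, boolean)} with 'discard' set to false. */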
public static int skipBatch(long handle, int batchSize) {
return skipBatch(handle, batchSize, false);
}
/* Native APIs */
/**
* Creates a reader for a primitive Parquet column.
*
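* <p>A minimal illustrative call for a flat, signed 32-bit integer column. All id and level
* values below are hypothetical placeholders; in practice they are derived from the Parquet file
* schema and the Spark read schema:
*
* <pre>{@code
* long handle = Native.initColumnReader(
*     intPhysicalTypeId, // placeholder id for the INT32 physical type
*     intLogicalTypeId, // placeholder id for the INT(32, true) logical type
*     intPhysicalTypeId, // expected physical type id (no type promotion)
*     new String[] {"price"}, // path from the root schema to the column
*     1, // maxDl: nullable top-level column
*     0, // maxRl: not repeated
*     32, // bitWidth
*     32, // expectedBitWidth
*     true, // isSigned
*     0, // typeLength: not FIXED_LEN_BYTE_ARRAY
*     0, 0, // precision, expectedPrecision: not DECIMAL
*     0, 0, // scale, expectedScale: not DECIMAL
*     0, // tu: not TIMESTAMP
*     false, // isAdjustedUtc: not TIMESTAMP
*     8192, // batchSize
*     false, // useDecimal128
*     false); // useLegacyDateTimestampOrNTZ
* }</pre>
*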
* @param physicalTypeId id for Parquet physical type
* @param logicalTypeId id for Parquet logical type
* @param expectedPhysicalTypeId id for Parquet physical type, converted from Spark read type.
* This is used for type promotion.
* @param path the path from the root schema to the column, derived from the method
* 'ColumnDescriptor#getPath()'.
* @param maxDl the maximum definition level of the primitive column
* @param maxRl the maximum repetition level of the primitive column
* @param bitWidth (only set when logical type is INT) the bit width for the integer type (INT8,
* INT16, INT32, etc.)
* @param expectedBitWidth (only set when logical type is INT) the bit width converted from the
* Spark read type. This is used for type promotion.
* @param isSigned (only set when logical type is INT) whether the integer is signed or unsigned
* @param typeLength number of bytes required to store a value of the type; only set when the
* physical type is FIXED_LEN_BYTE_ARRAY, otherwise it's 0
* @param precision (only set when logical type is DECIMAL) precision of the decimal type
* @param expectedPrecision (only set when logical type is DECIMAL) precision of the decimal type
* from the Spark read schema. This is used for type promotion.
* @param scale (only set when logical type is DECIMAL) scale of the decimal type
* @param expectedScale (only set when logical type is DECIMAL) scale of the decimal type from
* the Spark read schema. This is used for type promotion.
* @param tu (only set when logical type is TIMESTAMP) the unit for the timestamp
* @param isAdjustedUtc (only set when logical type is TIMESTAMP) whether the timestamp is
* adjusted to UTC or not
* @param batchSize the batch size for the columnar read
* @param useDecimal128 whether to always return a 128-bit decimal regardless of precision
* @param useLegacyDateTimestampOrNTZ whether to read legacy dates/timestamps as-is
* @return a pointer to the created native Parquet column reader
*/
public static native long initColumnReader(
int physicalTypeId,
int logicalTypeId,
int expectedPhysicalTypeId,
String[] path,
int maxDl,
int maxRl,
int bitWidth,
int expectedBitWidth,
boolean isSigned,
int typeLength,
int precision,
int expectedPrecision,
int scale,
int expectedScale,
int tu,
boolean isAdjustedUtc,
int batchSize,
boolean useDecimal128,
boolean useLegacyDateTimestampOrNTZ);
/**
* Passes a Parquet dictionary page to the native column reader. Note that this should be called
* at most once per Parquet column chunk; otherwise the native reader will panic.
*
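* <p>An illustrative call ordering under assumed placeholder inputs; a chunk's dictionary page,
* if present, is fed before its data pages:
*
* <pre>{@code
* Native.setDictionaryPage(handle, dictValueCount, dictPageData, dictEncoding);
* Native.setPageV1(handle, pageValueCount, pageData, valueEncoding);
* }</pre>
*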
* @param handle the handle to the native Parquet column reader
* @param dictionaryValueCount the number of values in this dictionary
* @param dictionaryData the actual dictionary page data, which should only contain PLAIN-encoded
* values
* @param encoding the encoding used by the dictionary
*/
public static native void setDictionaryPage(
long handle, int dictionaryValueCount, byte[] dictionaryData, int encoding);
/**
* Passes a Parquet data page V1 to the native column reader.
*
* @param handle the handle to the native Parquet column reader
* @param pageValueCount the number of values in this data page
* @param pageData the actual page data, including repetition/definition levels as well as values
* @param valueEncoding the encoding used by the values
*/
public static native void setPageV1(
long handle, int pageValueCount, byte[] pageData, int valueEncoding);
/**
* Passes a Parquet data page V1 to the native column reader.
*
* @param handle the handle to the native Parquet column reader
* @param pageValueCount the number of values in this data page
* @param buffer the actual page data, represented by a DirectByteBuffer.
* @param valueEncoding the encoding used by the values
*/
public static native void setPageBufferV1(
long handle, int pageValueCount, ByteBuffer buffer, int valueEncoding);
/**
* Passes a Parquet data page V2 to the native column reader.
*
* @param handle the handle to the native Parquet column reader
* @param pageValueCount the number of values in this data page
* @param defLevelData the data for definition levels
* @param repLevelData the data for repetition levels
* @param valueData the data for values
* @param valueEncoding the encoding used by the values
*/
public static native void setPageV2(
long handle,
int pageValueCount,
byte[] defLevelData,
byte[] repLevelData,
byte[] valueData,
int valueEncoding);
/**
* Reset the current columnar batch. This will clear all the content of the batch as well as any
* internal state such as the current offset.
*
* @param handle the handle to the native Parquet column reader
*/
public static native void resetBatch(long handle);
/**
* Reads at most 'batchSize' rows from the native Parquet column reader. Returns a tuple where
* the first element is the actual number of rows read (including both nulls and non-nulls), and
* the second element is the number of nulls read.
*
* <p>If the returned value is < 'batchSize', it means the current page has been completely
* drained. In this case, the caller should call {@link Native#setPageV1} or {@link
* Native#setPageV2} before the next 'readBatch' call.
*
* <p>Note that the current page could also be drained when the returned value = 'batchSize',
* i.e., the remaining number of rows in the page is exactly equal to 'batchSize'. In this case,
* the next 'readBatch' call will return 0 and the caller should call {@link Native#setPageV1} or
* {@link Native#setPageV2} next.
*
* <p>If 'nullPadSize' > 0, that many nulls are padded into the underlying vector before the
* values are read into it.
*
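* <p>A minimal read-loop sketch under assumed inputs ('handle', page buffers, value counts, and
* encoding ids are placeholders supplied by the caller):
*
* <pre>{@code
* Native.setPageV1(handle, pageValueCount, pageData, valueEncoding);
* int[] result = Native.readBatch(handle, 8192);
* int rowsRead = result[0]; // includes both nulls and non-nulls
* if (rowsRead < 8192) {
*   // Page drained: feed the next page before the next readBatch call.
*   Native.setPageV1(handle, nextPageValueCount, nextPageData, nextValueEncoding);
* }
* }</pre>
*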
* @param handle the handle to the native Parquet column reader
* @param batchSize the number of rows to be read
* @param nullPadSize the number of nulls to pad before reading.
* @return a tuple: (the actual number of rows read, the number of nulls read)
*/
public static native int[] readBatch(long handle, int batchSize, int nullPadSize);
/**
* Skips at most 'batchSize' rows from the native Parquet column reader, and returns the actual
* number of rows skipped.
*
* <p>If the returned value is < 'batchSize', it means the current page has been completely
* drained. In this case, the caller should call {@link Native#setPageV1} or {@link
* Native#setPageV2} before the next 'skipBatch' call.
*
* <p>Note that the current page could also be drained when the returned value = 'batchSize',
* i.e., the remaining number of rows in the page is exactly equal to 'batchSize'. In this case,
* the next 'skipBatch' call will return 0 and the caller should call {@link Native#setPageV1} or
* {@link Native#setPageV2} next.
*
* @param handle the handle to the native Parquet column reader
* @param batchSize the number of rows to skip in the current page
* @param discard if true, discard read rows without padding nulls into the underlying vector
* @return the actual number of rows skipped
*/
public static native int skipBatch(long handle, int batchSize, boolean discard);
/**
* Returns the current batch constructed via 'readBatch'.
*
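* <p>One way to materialize the result on the JVM side, assuming Arrow's Java C Data Interface
* module (org.apache.arrow:arrow-c-data) and a BufferAllocator named 'allocator'; this is an
* illustrative sketch, not necessarily how Comet itself consumes the addresses:
*
* <pre>{@code
* long[] addrs = Native.currentBatch(handle);
* try (org.apache.arrow.c.ArrowArray array = org.apache.arrow.c.ArrowArray.wrap(addrs[0]);
*     org.apache.arrow.c.ArrowSchema schema = org.apache.arrow.c.ArrowSchema.wrap(addrs[1])) {
*   org.apache.arrow.vector.FieldVector vector =
*       org.apache.arrow.c.Data.importVector(allocator, array, schema, null);
*   // consume 'vector' ...
* }
* }</pre>
*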
* @param handle the handle to the native Parquet column reader
* @return a long array with 2 elements: the first is the address of the native Arrow array, and
* the second is the address of the Arrow schema
*/
public static native long[] currentBatch(long handle);
/**
* Setters that assign a constant value to the reader, so it'll return constant vectors.
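*
* <p>An illustrative sketch; 'handle' is assumed to come from {@link Native#initColumnReader}:
*
* <pre>{@code
* Native.setInt(handle, 42); // subsequent batches read as a constant vector of 42s
* }</pre>
*/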
public static native void setNull(long handle);
public static native void setBoolean(long handle, boolean value);
public static native void setByte(long handle, byte value);
public static native void setShort(long handle, short value);
public static native void setInt(long handle, int value);
public static native void setLong(long handle, long value);
public static native void setFloat(long handle, float value);
public static native void setDouble(long handle, double value);
public static native void setBinary(long handle, byte[] value);
/** Sets a decimal value backed by FIXED_LEN_BYTE_ARRAY. */
public static native void setDecimal(long handle, byte[] value);
/** Sets the position of the row index vector for the Iceberg metadata column. */
public static native void setPosition(long handle, long value, int size);
/** Sets the row index vector for the Spark row index metadata column and returns the vector size. */
public static native int setIndices(long handle, long offset, int size, long[] indices);
/** Sets the is-deleted info for the Iceberg metadata column. */
public static native void setIsDeleted(long handle, boolean[] isDeleted);
/**
* Closes the native Parquet column reader and releases all resources associated with it.
*
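* <p>A typical lifecycle sketch ('...' stands for the full argument list, elided here):
*
* <pre>{@code
* long handle = Native.initColumnReader(...); // see initColumnReader for the arguments
* try {
*   // feed pages and read batches
* } finally {
*   Native.closeColumnReader(handle);
* }
* }</pre>
*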
* @param handle the handle to the native Parquet column reader
*/
public static native void closeColumnReader(long handle);
}