All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.comet.parquet.Native Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet.parquet;

import java.nio.ByteBuffer;

import org.apache.comet.NativeBase;

public final class Native extends NativeBase {

  // ---------------------------------------------------------------------------------------------
  // Reader creation
  // ---------------------------------------------------------------------------------------------

  /**
   * Creates a native reader for one primitive Parquet column.
   *
   * @param physicalTypeId id of the Parquet physical type
   * @param logicalTypeId id of the Parquet logical type
   * @param expectedPhysicalTypeId id of the Parquet physical type converted from the Spark read
   *     type; used for type promotion
   * @param path path from the root schema to the column, as produced by
   *     'ColumnDescriptor#getPath()'
   * @param maxDl maximum definition level of the primitive column
   * @param maxRl maximum repetition level of the primitive column
   * @param bitWidth bit width of the integer type (INT8, INT16, INT32, etc); only set when the
   *     logical type is INT
   * @param expectedBitWidth NOTE(review): not described by the original documentation; presumably
   *     the bit width from the Spark read type, by analogy with {@code expectedPrecision} - TODO
   *     confirm
   * @param isSigned whether the integer is signed or unsigned; only set when the logical type is
   *     INT
   * @param typeLength number of bytes needed to store one value; only set when the physical type
   *     is FIXED_LEN_BYTE_ARRAY, otherwise 0
   * @param precision precision of the decimal type; only set when the logical type is DECIMAL
   * @param expectedPrecision precision of the decimal type from the Spark read schema, used for
   *     type promotion; only set when the logical type is DECIMAL
   * @param scale scale of the decimal type; only set when the logical type is DECIMAL
   * @param expectedScale NOTE(review): not described by the original documentation; presumably
   *     the decimal scale from the Spark read schema, by analogy with {@code expectedPrecision} -
   *     TODO confirm
   * @param tu unit of the timestamp; only set when the logical type is TIMESTAMP
   * @param isAdjustedUtc whether the timestamp is adjusted to UTC; only set when the logical type
   *     is TIMESTAMP
   * @param batchSize batch size for the columnar read
   * @param useDecimal128 whether to always return 128 bit decimals regardless of precision
   * @param useLegacyDateTimestampOrNTZ whether to read legacy dates/timestamps as they are
   * @return a pointer to the native Parquet column reader that was created
   */
  public static native long initColumnReader(
      int physicalTypeId,
      int logicalTypeId,
      int expectedPhysicalTypeId,
      String[] path,
      int maxDl,
      int maxRl,
      int bitWidth,
      int expectedBitWidth,
      boolean isSigned,
      int typeLength,
      int precision,
      int expectedPrecision,
      int scale,
      int expectedScale,
      int tu,
      boolean isAdjustedUtc,
      int batchSize,
      boolean useDecimal128,
      boolean useLegacyDateTimestampOrNTZ);

  // ---------------------------------------------------------------------------------------------
  // Feeding pages to the reader
  // ---------------------------------------------------------------------------------------------

  /**
   * Hands a Parquet dictionary page to the native column reader.
   *
   * <p>This must be called at most once per Parquet column chunk; calling it again panics.
   *
   * @param handle the handle to the native Parquet column reader
   * @param dictionaryValueCount number of values in the dictionary
   * @param dictionaryData the dictionary page data, including repetition/definition levels as
   *     well as values
   * @param encoding the encoding used by the dictionary
   */
  public static native void setDictionaryPage(
      long handle, int dictionaryValueCount, byte[] dictionaryData, int encoding);

  /**
   * Hands a Parquet data page V1, held in a byte array, to the native column reader.
   *
   * @param handle the handle to the native Parquet column reader
   * @param pageValueCount number of values in the data page
   * @param pageData the page data, which should only contain PLAIN-encoded values
   * @param valueEncoding the encoding used by the values
   */
  public static native void setPageV1(
      long handle, int pageValueCount, byte[] pageData, int valueEncoding);

  /**
   * Hands a Parquet data page V1, held in a buffer, to the native column reader.
   *
   * @param handle the handle to the native Parquet column reader
   * @param pageValueCount number of values in the data page
   * @param buffer the page data, represented by a DirectByteBuffer
   * @param valueEncoding the encoding used by the values
   */
  public static native void setPageBufferV1(
      long handle, int pageValueCount, ByteBuffer buffer, int valueEncoding);

  /**
   * Hands a Parquet data page V2 to the native column reader.
   *
   * @param handle the handle to the native Parquet column reader
   * @param pageValueCount number of values in the data page
   * @param defLevelData the definition level data
   * @param repLevelData the repetition level data
   * @param valueData the value data
   * @param valueEncoding the encoding used by the values
   */
  public static native void setPageV2(
      long handle,
      int pageValueCount,
      byte[] defLevelData,
      byte[] repLevelData,
      byte[] valueData,
      int valueEncoding);

  // ---------------------------------------------------------------------------------------------
  // Reading and skipping batches
  // ---------------------------------------------------------------------------------------------

  /**
   * Resets the current columnar batch, clearing its content as well as any internal state such as
   * the current offset.
   *
   * @param handle the handle to the native Parquet column reader
   */
  public static native void resetBatch(long handle);

  /**
   * Reads at most 'batchSize' rows from the native Parquet column reader and reports how many rows
   * and how many nulls were read.
   *
   * <p>A row count smaller than 'batchSize' means the current page has been completely drained;
   * the caller should then call {@link Native#setPageV1} or {@link Native#setPageV2} before the
   * next 'readBatch' call.
   *
   * <p>The page can also be drained when the row count equals 'batchSize', i.e. when exactly
   * 'batchSize' rows were left in the page. The next 'readBatch' call then returns 0 and the
   * caller should call {@link Native#setPageV1} or {@link Native#setPageV2} next.
   *
   * <p>If 'nullPadSize' is greater than 0, that many nulls are padded into the underlying vector
   * before the values are read into it.
   *
   * @param handle the handle to the native Parquet column reader
   * @param batchSize the number of rows to be read
   * @param nullPadSize the number of nulls to pad before reading
   * @return a tuple: (the actual number of rows read, including both nulls and non-nulls; the
   *     number of nulls read)
   */
  public static native int[] readBatch(long handle, int batchSize, int nullPadSize);

  /**
   * Convenience overload of {@link #readBatch(long, int, int)} that pads no nulls before reading.
   *
   * @param handle the handle to the native Parquet column reader
   * @param batchSize the number of rows to be read
   * @return a tuple: (the actual number of rows read, the number of nulls read)
   */
  public static int[] readBatch(long handle, int batchSize) {
    final int noNullPadding = 0;
    return readBatch(handle, batchSize, noNullPadding);
  }

  /**
   * Skips at most 'batchSize' rows from the native Parquet column reader and returns the actual
   * number of rows skipped.
   *
   * <p>A return value smaller than 'batchSize' means the current page has been completely
   * drained; the caller should then call {@link Native#setPageV1} or {@link Native#setPageV2}
   * before the next 'skipBatch' call.
   *
   * <p>The page can also be drained when the return value equals 'batchSize', i.e. when exactly
   * 'batchSize' rows were left in the page. The next 'skipBatch' call then returns 0 and the
   * caller should call {@link Native#setPageV1} or {@link Native#setPageV2} next.
   *
   * @param handle the handle to the native Parquet column reader
   * @param batchSize the number of rows to skip in the current page
   * @param discard if true, discard read rows without padding nulls into the underlying vector
   * @return the actual number of rows skipped
   */
  public static native int skipBatch(long handle, int batchSize, boolean discard);

  /**
   * Convenience overload of {@link #skipBatch(long, int, boolean)} that passes {@code false} for
   * 'discard'.
   *
   * @param handle the handle to the native Parquet column reader
   * @param batchSize the number of rows to skip in the current page
   * @return the actual number of rows skipped
   */
  public static int skipBatch(long handle, int batchSize) {
    final boolean keepNullPadding = false;
    return skipBatch(handle, batchSize, keepNullPadding);
  }

  /**
   * Returns the current batch constructed via 'readBatch'.
   *
   * @param handle the handle to the native Parquet column reader
   * @return a long array with 2 elements: the first is the address of the native Arrow array, the
   *     second is the address of the Arrow schema
   */
  public static native long[] currentBatch(long handle);

  // ---------------------------------------------------------------------------------------------
  // Constant values: these setters make the reader return constant vectors
  // ---------------------------------------------------------------------------------------------

  /** Sets null as the constant value of the reader. */
  public static native void setNull(long handle);

  /** Sets a boolean as the constant value of the reader. */
  public static native void setBoolean(long handle, boolean value);

  /** Sets a byte as the constant value of the reader. */
  public static native void setByte(long handle, byte value);

  /** Sets a short as the constant value of the reader. */
  public static native void setShort(long handle, short value);

  /** Sets an int as the constant value of the reader. */
  public static native void setInt(long handle, int value);

  /** Sets a long as the constant value of the reader. */
  public static native void setLong(long handle, long value);

  /** Sets a float as the constant value of the reader. */
  public static native void setFloat(long handle, float value);

  /** Sets a double as the constant value of the reader. */
  public static native void setDouble(long handle, double value);

  /** Sets a binary value as the constant value of the reader. */
  public static native void setBinary(long handle, byte[] value);

  /** Sets a decimal, backed by FixedLengthByteArray, as the constant value of the reader. */
  public static native void setDecimal(long handle, byte[] value);

  // ---------------------------------------------------------------------------------------------
  // Metadata columns
  // ---------------------------------------------------------------------------------------------

  /** Sets the position of the row index vector for the Iceberg metadata column. */
  public static native void setPosition(long handle, long value, int size);

  /**
   * Sets the row index vector for the Spark row index metadata column.
   *
   * @return the vector size
   */
  public static native int setIndices(long handle, long offset, int size, long[] indices);

  /** Sets the deleted info for the Iceberg metadata column. */
  public static native void setIsDeleted(long handle, boolean[] isDeleted);

  // ---------------------------------------------------------------------------------------------
  // Reader teardown
  // ---------------------------------------------------------------------------------------------

  /**
   * Closes the native Parquet column reader and releases all resources associated with it.
   *
   * @param handle the handle to the native Parquet column reader
   */
  public static native void closeColumnReader(long handle);
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy