com.nvidia.spark.rapids.iceberg.parquet.GpuParquet (rapids-4-spark_2.13)
Creates the distribution package of the RAPIDS plugin for Apache Spark

/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids.iceberg.parquet;

import java.util.Collection;
import java.util.Map;
import com.nvidia.spark.rapids.GpuMetric;
import com.nvidia.spark.rapids.iceberg.data.GpuDeleteFilter;
import com.nvidia.spark.rapids.shims.PartitionedFileUtilsShim;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
import org.apache.spark.sql.vectorized.ColumnarBatch;

/**
 * GPU version of Apache Iceberg's Parquet class.
 * The Iceberg version originally accepted a callback function to create the reader so it could
 * handle vectorized batch readers as well as row readers. Since the GPU reads only vectorized
 * batches, that abstraction has been removed.
 */
public class GpuParquet {
  private static final Collection<String> READ_PROPERTIES_TO_REMOVE = Sets.newHashSet(
      "parquet.read.filter", "parquet.private.read.filter.predicate", "parquet.read.support.class");

  private GpuParquet() {
  }

  public static ReadBuilder read(InputFile file) {
    return new ReadBuilder(file);
  }
  /** Builder for configuring a GPU read of a single Parquet file or split. */
  public static class ReadBuilder {
    private final InputFile file;
    private Long start = null;
    private Long length = null;
    private Schema projectSchema = null;
    private Map<Integer, ?> idToConstant = null;
    private GpuDeleteFilter deleteFilter = null;
    private Expression filter = null;
    private boolean caseSensitive = true;
    private NameMapping nameMapping = null;
    private Configuration conf = null;
    private int maxBatchSizeRows = Integer.MAX_VALUE;
    private long maxBatchSizeBytes = Integer.MAX_VALUE;
    private long targetBatchSizeBytes = Integer.MAX_VALUE;
    private boolean useChunkedReader = false;
    private long maxChunkedReaderMemoryUsageSizeBytes = 0;
    private scala.Option<String> debugDumpPrefix = null;
    private boolean debugDumpAlways = false;
    private scala.collection.immutable.Map<String, GpuMetric> metrics = null;

    private ReadBuilder(InputFile file) {
      this.file = file;
    }
    /**
     * Restricts the read to the given range: [start, start + length).
     *
     * @param newStart the start position for this read
     * @param newLength the length of the range this read should scan
     * @return this builder for method chaining
     */
    public ReadBuilder split(long newStart, long newLength) {
      this.start = newStart;
      this.length = newLength;
      return this;
    }

    public ReadBuilder project(Schema newSchema) {
      this.projectSchema = newSchema;
      return this;
    }

    public ReadBuilder caseSensitive(boolean newCaseSensitive) {
      this.caseSensitive = newCaseSensitive;
      return this;
    }

    public ReadBuilder constants(Map<Integer, ?> constantsMap) {
      this.idToConstant = constantsMap;
      return this;
    }

    public ReadBuilder deleteFilter(GpuDeleteFilter deleteFilter) {
      this.deleteFilter = deleteFilter;
      return this;
    }

    public ReadBuilder filter(Expression newFilter) {
      this.filter = newFilter;
      return this;
    }

    public ReadBuilder withNameMapping(NameMapping newNameMapping) {
      this.nameMapping = newNameMapping;
      return this;
    }

    public ReadBuilder withConfiguration(Configuration conf) {
      this.conf = conf;
      return this;
    }

    public ReadBuilder withMaxBatchSizeRows(int maxBatchSizeRows) {
      this.maxBatchSizeRows = maxBatchSizeRows;
      return this;
    }

    public ReadBuilder withMaxBatchSizeBytes(long maxBatchSizeBytes) {
      this.maxBatchSizeBytes = maxBatchSizeBytes;
      return this;
    }

    public ReadBuilder withTargetBatchSizeBytes(long targetBatchSizeBytes) {
      this.targetBatchSizeBytes = targetBatchSizeBytes;
      return this;
    }

    public ReadBuilder withUseChunkedReader(boolean useChunkedReader,
        long maxChunkedReaderMemoryUsageSizeBytes) {
      this.useChunkedReader = useChunkedReader;
      this.maxChunkedReaderMemoryUsageSizeBytes = maxChunkedReaderMemoryUsageSizeBytes;
      return this;
    }

    public ReadBuilder withDebugDump(scala.Option<String> dumpPrefix, boolean dumpAlways) {
      this.debugDumpPrefix = dumpPrefix;
      this.debugDumpAlways = dumpAlways;
      return this;
    }

    public ReadBuilder withMetrics(scala.collection.immutable.Map<String, GpuMetric> metrics) {
      this.metrics = metrics;
      return this;
    }
    /** Builds a GpuParquetReader that iterates the configured file range as GPU ColumnarBatches. */
    public CloseableIterable<ColumnarBatch> build() {
      ParquetReadOptions options = buildReaderOptions(file, start, length);
      PartitionedFile partFile = PartitionedFileUtilsShim.newPartitionedFile(
          InternalRow.empty(), file.location(), start, length);
      return new GpuParquetReader(file, projectSchema, options, nameMapping, filter, caseSensitive,
          idToConstant, deleteFilter, partFile, conf, maxBatchSizeRows, maxBatchSizeBytes,
          targetBatchSizeBytes, useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes,
          debugDumpPrefix, debugDumpAlways, metrics);
    }
  }
  public static ParquetReadOptions buildReaderOptions(InputFile file, Long start, Long length) {
    ParquetReadOptions.Builder optionsBuilder;
    if (file instanceof HadoopInputFile) {
      // remove read properties already set that may conflict with this read
      Configuration conf = new Configuration(((HadoopInputFile) file).getConf());
      for (String property : READ_PROPERTIES_TO_REMOVE) {
        conf.unset(property);
      }
      optionsBuilder = HadoopReadOptions.builder(conf);
    } else {
      //optionsBuilder = ParquetReadOptions.builder();
      throw new UnsupportedOperationException("Only Hadoop files are supported for now");
    }
    if (start != null) {
      optionsBuilder.withRange(start, start + length);
    }
    return optionsBuilder.build();
  }
}
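
A minimal usage sketch, not part of the plugin source, showing how the ReadBuilder above is expected to be chained. The file path, Hadoop Configuration, Iceberg Schema, split offsets, and metrics map are placeholders assumed to come from the surrounding Iceberg scan task; only the GpuParquet and ReadBuilder calls themselves come from the class listed above, and whether the resulting reader runs successfully depends on the rest of the plugin's setup.

// Hypothetical helper illustrating the builder chaining. Everything passed in is assumed
// to be supplied by the caller; java.io.IOException is the only import not already present
// in the file above.
static long countBatches(Configuration hadoopConf, Schema expectedSchema,
    long start, long length,
    scala.collection.immutable.Map<String, GpuMetric> metrics) throws java.io.IOException {
  // Placeholder path; HadoopInputFile.fromLocation builds an Iceberg InputFile from it.
  InputFile inputFile = HadoopInputFile.fromLocation(
      "hdfs://namenode/warehouse/db/table/data/part-00000.parquet", hadoopConf);
  long count = 0;
  try (CloseableIterable<ColumnarBatch> batches = GpuParquet.read(inputFile)
      .project(expectedSchema)          // Iceberg schema to project
      .split(start, length)             // read only the byte range [start, start + length)
      .caseSensitive(true)
      .withConfiguration(hadoopConf)
      .withMetrics(metrics)
      .build()) {
    for (ColumnarBatch batch : batches) {
      count++;                          // process each GPU ColumnarBatch here
    }
  }
  return count;
}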