/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.comet.parquet;
import java.io.Closeable;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import scala.Option;
import scala.collection.Seq;
import scala.collection.mutable.Buffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.comet.shaded.arrow.c.CometSchemaImporter;
import org.apache.comet.shaded.arrow.memory.BufferAllocator;
import org.apache.comet.shaded.arrow.memory.RootAllocator;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.Preconditions;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import org.apache.spark.TaskContext;
import org.apache.spark.TaskContext$;
import org.apache.spark.executor.TaskMetrics;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.comet.parquet.CometParquetReadSupport;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
import org.apache.spark.sql.execution.metric.SQLMetric;
import org.apache.spark.sql.types.*;
import org.apache.spark.sql.vectorized.ColumnarBatch;
import org.apache.spark.util.AccumulatorV2;
import org.apache.comet.CometConf;
import org.apache.comet.shims.ShimBatchReader;
import org.apache.comet.shims.ShimFileFormat;
import org.apache.comet.vector.CometVector;
/**
* A vectorized Parquet reader that reads a Parquet file in a batched fashion.
*
* <p>Example of how to use this:
*
* <pre>
*   BatchReader reader = new BatchReader(parquetFile, batchSize);
*   try {
*     reader.init();
*     while (reader.nextBatch()) {
*       ColumnarBatch batch = reader.currentBatch();
*       // consume the batch
*     }
*   } finally { // resources associated with the reader should be released
*     reader.close();
*   }
* </pre>
*
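* <p>When prefetching is enabled, {@code init()} and row-group loading run on a separate thread
* instead of the caller. A minimal sketch, assuming prefetching has been turned on via
* {@code CometConf.COMET_SCAN_PREFETCH_ENABLED()} in the supplied configuration and that an
* {@code ExecutorService} named {@code threadPool} exists (an illustrative name, not part of this
* class):
*
* <pre>
*   BatchReader reader = new BatchReader(conf, parquetFile, batchSize, null, null);
*   reader.submitPrefetchTask(threadPool); // the task calls init() and queues row groups
*   try {
*     while (reader.nextBatch()) {
*       ColumnarBatch batch = reader.currentBatch();
*       // consume the batch
*     }
*   } finally {
*     reader.close();
*   }
* </pre>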
*/
public class BatchReader extends RecordReader<Void, ColumnarBatch> implements Closeable {
private static final Logger LOG = LoggerFactory.getLogger(BatchReader.class);
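// Shared Arrow buffer allocator, passed to the CometSchemaImporter created for each row group.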
protected static final BufferAllocator ALLOCATOR = new RootAllocator();
private Configuration conf;
private int capacity;
private boolean isCaseSensitive;
private boolean useFieldId;
private boolean ignoreMissingIds;
private StructType partitionSchema;
private InternalRow partitionValues;
private PartitionedFile file;
private final Map<String, SQLMetric> metrics;
private long rowsRead;
private StructType sparkSchema;
private MessageType requestedSchema;
private CometVector[] vectors;
private AbstractColumnReader[] columnReaders;
private CometSchemaImporter importer;
private ColumnarBatch currentBatch;
private Future<Option<Throwable>> prefetchTask;
private LinkedBlockingQueue<Pair<PageReadStore, Long>> prefetchQueue;
private FileReader fileReader;
private boolean[] missingColumns;
private boolean isInitialized;
private ParquetMetadata footer;
/** The total number of rows across all row groups of the input split. */
private long totalRowCount;
/**
* The total number of rows loaded so far, including all the rows from row groups that we've
* processed and the current row group.
*/
private long totalRowsLoaded;
/**
* Whether the native scan should always return decimals represented as 128 bits, regardless of
* their precision. Normally, this should be true if native execution is enabled, since the Arrow
* compute kernels don't support 32-bit and 64-bit decimals yet.
*/
private boolean useDecimal128;
/** Whether to use the lazy materialization reader for reading columns. */
private boolean useLazyMaterialization;
/**
* Whether to return dates/timestamps that were written with the legacy hybrid (Julian + Gregorian)
* calendar as-is. If this is true, Comet returns them as-is instead of rebasing them to the new
* Proleptic Gregorian calendar. If this is false, Comet throws an exception when it encounters
* such dates/timestamps.
*/
private boolean useLegacyDateTimestamp;
/** The TaskContext object for executing this task. */
private final TaskContext taskContext;
// Only for testing
public BatchReader(String file, int capacity) {
this(file, capacity, null, null);
}
// Only for testing
public BatchReader(
String file, int capacity, StructType partitionSchema, InternalRow partitionValues) {
this(new Configuration(), file, capacity, partitionSchema, partitionValues);
}
// Only for testing
public BatchReader(
Configuration conf,
String file,
int capacity,
StructType partitionSchema,
InternalRow partitionValues) {
conf.set("spark.sql.parquet.binaryAsString", "false");
conf.set("spark.sql.parquet.int96AsTimestamp", "false");
conf.set("spark.sql.caseSensitive", "false");
conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true");
conf.set("spark.sql.legacy.parquet.nanosAsLong", "false");
this.conf = conf;
this.capacity = capacity;
this.isCaseSensitive = false;
this.useFieldId = false;
this.ignoreMissingIds = false;
this.partitionSchema = partitionSchema;
this.partitionValues = partitionValues;
this.file = ShimBatchReader.newPartitionedFile(partitionValues, file);
this.metrics = new HashMap<>();
this.taskContext = TaskContext$.MODULE$.get();
}
public BatchReader(AbstractColumnReader[] columnReaders) {
// TODO: set useDecimal128 and useLazyMaterialization
int numColumns = columnReaders.length;
this.columnReaders = new AbstractColumnReader[numColumns];
vectors = new CometVector[numColumns];
currentBatch = new ColumnarBatch(vectors);
// This constructor is used by Iceberg only. The columnReaders are
// initialized in Iceberg, so there is no need to call init().
isInitialized = true;
this.taskContext = TaskContext$.MODULE$.get();
this.metrics = new HashMap<>();
}
BatchReader(
Configuration conf,
PartitionedFile inputSplit,
ParquetMetadata footer,
int capacity,
StructType sparkSchema,
boolean isCaseSensitive,
boolean useFieldId,
boolean ignoreMissingIds,
boolean useLegacyDateTimestamp,
StructType partitionSchema,
InternalRow partitionValues,
Map<String, SQLMetric> metrics) {
this.conf = conf;
this.capacity = capacity;
this.sparkSchema = sparkSchema;
this.isCaseSensitive = isCaseSensitive;
this.useFieldId = useFieldId;
this.ignoreMissingIds = ignoreMissingIds;
this.useLegacyDateTimestamp = useLegacyDateTimestamp;
this.partitionSchema = partitionSchema;
this.partitionValues = partitionValues;
this.file = inputSplit;
this.footer = footer;
this.metrics = metrics;
this.taskContext = TaskContext$.MODULE$.get();
}
/**
* Initializes this reader. The reason we don't do this in the constructor is that we want to be
* able to close any resources held by this reader if an error happens during initialization.
*/
public void init() throws URISyntaxException, IOException {
useDecimal128 =
conf.getBoolean(
CometConf.COMET_USE_DECIMAL_128().key(),
(Boolean) CometConf.COMET_USE_DECIMAL_128().defaultValue().get());
useLazyMaterialization =
conf.getBoolean(
CometConf.COMET_USE_LAZY_MATERIALIZATION().key(),
(Boolean) CometConf.COMET_USE_LAZY_MATERIALIZATION().defaultValue().get());
long start = file.start();
long length = file.length();
String filePath = file.filePath().toString();
ParquetReadOptions.Builder builder = HadoopReadOptions.builder(conf, new Path(filePath));
if (start >= 0 && length >= 0) {
builder = builder.withRange(start, start + length);
}
ParquetReadOptions readOptions = builder.build();
// TODO: enable off-heap buffer when they are ready
ReadOptions cometReadOptions = ReadOptions.builder(conf).build();
Path path = new Path(new URI(filePath));
fileReader =
new FileReader(
CometInputFile.fromPath(path, conf), footer, readOptions, cometReadOptions, metrics);
requestedSchema = fileReader.getFileMetaData().getSchema();
MessageType fileSchema = requestedSchema;
if (sparkSchema == null) {
sparkSchema = new ParquetToSparkSchemaConverter(conf).convert(requestedSchema);
} else {
requestedSchema =
CometParquetReadSupport.clipParquetSchema(
requestedSchema, sparkSchema, isCaseSensitive, useFieldId, ignoreMissingIds);
if (requestedSchema.getColumns().size() != sparkSchema.size()) {
throw new IllegalArgumentException(
String.format(
"Spark schema has %d columns while " + "Parquet schema has %d columns",
sparkSchema.size(), requestedSchema.getColumns().size()));
}
}
totalRowCount = fileReader.getRecordCount();
List<ColumnDescriptor> columns = requestedSchema.getColumns();
int numColumns = columns.size();
if (partitionSchema != null) numColumns += partitionSchema.size();
columnReaders = new AbstractColumnReader[numColumns];
// Initialize missing columns and use null vectors for them
missingColumns = new boolean[columns.size()];
List<String[]> paths = requestedSchema.getPaths();
StructField[] nonPartitionFields = sparkSchema.fields();
ShimFileFormat.findRowIndexColumnIndexInSchema(sparkSchema);
for (int i = 0; i < requestedSchema.getFieldCount(); i++) {
Type t = requestedSchema.getFields().get(i);
Preconditions.checkState(
t.isPrimitive() && !t.isRepetition(Type.Repetition.REPEATED),
"Complex type is not supported");
String[] colPath = paths.get(i);
if (nonPartitionFields[i].name().equals(ShimFileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME())) {
// Values of ROW_INDEX_TEMPORARY_COLUMN_NAME column are always populated with
// generated row indexes, rather than read from the file.
// TODO(SPARK-40059): Allow users to include columns named
// FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME in their schemas.
long[] rowIndices = fileReader.getRowIndices();
columnReaders[i] = new RowIndexColumnReader(nonPartitionFields[i], capacity, rowIndices);
missingColumns[i] = true;
} else if (fileSchema.containsPath(colPath)) {
ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
if (!fd.equals(columns.get(i))) {
throw new UnsupportedOperationException("Schema evolution is not supported");
}
missingColumns[i] = false;
} else {
if (columns.get(i).getMaxDefinitionLevel() == 0) {
throw new IOException(
"Required column '"
+ Arrays.toString(colPath)
+ "' is missing"
+ " in data file "
+ filePath);
}
ConstantColumnReader reader =
new ConstantColumnReader(nonPartitionFields[i], capacity, useDecimal128);
columnReaders[i] = reader;
missingColumns[i] = true;
}
}
// Initialize constant readers for partition columns
if (partitionSchema != null) {
StructField[] partitionFields = partitionSchema.fields();
for (int i = columns.size(); i < columnReaders.length; i++) {
int fieldIndex = i - columns.size();
StructField field = partitionFields[fieldIndex];
ConstantColumnReader reader =
new ConstantColumnReader(field, capacity, partitionValues, fieldIndex, useDecimal128);
columnReaders[i] = reader;
}
}
vectors = new CometVector[numColumns];
currentBatch = new ColumnarBatch(vectors);
fileReader.setRequestedSchema(requestedSchema.getColumns());
// For test purposes only.
// If the last external accumulator is a `NumRowGroupsAcc`, the number of row groups to read is
// added to that accumulator, so tests can check whether row groups were filtered or not.
// Note that this uses the thread-local TaskContext object; if this is called from another
// thread, it won't update the accumulator.
if (taskContext != null) {
Option<AccumulatorV2<?, ?>> accu = getTaskAccumulator(taskContext.taskMetrics());
if (accu.isDefined() && accu.get().getClass().getSimpleName().equals("NumRowGroupsAcc")) {
@SuppressWarnings("unchecked")
AccumulatorV2<Integer, Integer> intAccum = (AccumulatorV2<Integer, Integer>) accu.get();
intAccum.add(fileReader.getRowGroups().size());
}
}
// Pre-fetching
boolean preFetchEnabled =
conf.getBoolean(
CometConf.COMET_SCAN_PREFETCH_ENABLED().key(),
(boolean) CometConf.COMET_SCAN_PREFETCH_ENABLED().defaultValue().get());
if (preFetchEnabled) {
LOG.info("Prefetch enabled for BatchReader.");
this.prefetchQueue = new LinkedBlockingQueue<>();
}
isInitialized = true;
synchronized (this) {
// If prefetch is enabled, `init()` is called in a separate thread. When
// `BatchReader.nextBatch()` is called concurrently, it is possible that `init()` has not been
// called or has not finished yet. We need to block `nextBatch()` until the initialization of
// this `BatchReader` is done. Once initialization is about to finish, we notify the thread
// waiting in `nextBatch()` so that it can continue.
notifyAll();
}
}
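/**
* Sets the Spark read schema explicitly. If this is not set, {@link #init()} derives it from the
* Parquet file schema using {@code ParquetToSparkSchemaConverter}.
*/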
public void setSparkSchema(StructType schema) {
this.sparkSchema = schema;
}
public AbstractColumnReader[] getColumnReaders() {
return columnReaders;
}
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
// Do nothing. The initialization work is done in 'init' already.
}
@Override
public boolean nextKeyValue() throws IOException {
return nextBatch();
}
@Override
public Void getCurrentKey() {
return null;
}
@Override
public ColumnarBatch getCurrentValue() {
return currentBatch();
}
@Override
public float getProgress() {
return (float) rowsRead / totalRowCount;
}
/**
* Returns the current columnar batch being read.
*
* Note that this must be called AFTER {@link BatchReader#nextBatch()}.
*/
public ColumnarBatch currentBatch() {
return currentBatch;
}
// Only for testing
public Future<Option<Throwable>> getPrefetchTask() {
return this.prefetchTask;
}
// Only for testing
public LinkedBlockingQueue<Pair<PageReadStore, Long>> getPrefetchQueue() {
return this.prefetchQueue;
}
/**
* Loads the next batch of rows.
*
* @return true if a batch of rows was loaded, false if there are no more rows to read.
*/
public boolean nextBatch() throws IOException {
if (this.prefetchTask == null) {
Preconditions.checkState(isInitialized, "init() should be called first!");
} else {
// If prefetch is enabled, this reader is initialized asynchronously from a different
// thread. Wait until it is initialized.
while (!isInitialized) {
synchronized (this) {
try {
// Wait until the initialization of the current `BatchReader` (i.e., `init()`) is done.
// It is possible that `init()` finishes shortly after we enter this while loop, so a
// short timeout is used.
wait(100);
// Checks if prefetch task is finished. If so, tries to get exception if any.
if (prefetchTask.isDone()) {
Option<Throwable> exception = prefetchTask.get();
if (exception.isDefined()) {
throw exception.get();
}
}
} catch (RuntimeException e) {
// Spark will check certain exception e.g. `SchemaColumnConvertNotSupportedException`.
throw e;
} catch (Throwable e) {
throw new IOException(e);
}
}
}
}
if (rowsRead >= totalRowCount) return false;
boolean hasMore;
try {
hasMore = loadNextRowGroupIfNecessary();
} catch (RuntimeException e) {
// Spark will check certain exception e.g. `SchemaColumnConvertNotSupportedException`.
throw e;
} catch (Throwable e) {
throw new IOException(e);
}
if (!hasMore) return false;
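// Cap the batch size at the number of rows remaining in the row groups loaded so far.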
int batchSize = (int) Math.min(capacity, totalRowsLoaded - rowsRead);
return nextBatch(batchSize);
}
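/**
* Reads {@code batchSize} rows from each column reader into {@link #vectors}, records the decode
* and load times in the corresponding SQL metrics (if present), and advances {@code rowsRead}.
* Assumes the column readers are already positioned on a row group (see {@link #nextBatch()}).
*/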
public boolean nextBatch(int batchSize) {
long totalDecodeTime = 0, totalLoadTime = 0;
for (int i = 0; i < columnReaders.length; i++) {
AbstractColumnReader reader = columnReaders[i];
long startNs = System.nanoTime();
reader.readBatch(batchSize);
totalDecodeTime += System.nanoTime() - startNs;
startNs = System.nanoTime();
vectors[i] = reader.currentBatch();
totalLoadTime += System.nanoTime() - startNs;
}
SQLMetric decodeMetric = metrics.get("ParquetNativeDecodeTime");
if (decodeMetric != null) {
decodeMetric.add(totalDecodeTime);
}
SQLMetric loadMetric = metrics.get("ParquetNativeLoadTime");
if (loadMetric != null) {
loadMetric.add(totalLoadTime);
}
currentBatch.setNumRows(batchSize);
rowsRead += batchSize;
return true;
}
@Override
public void close() throws IOException {
if (columnReaders != null) {
for (AbstractColumnReader reader : columnReaders) {
if (reader != null) {
reader.close();
}
}
}
if (fileReader != null) {
fileReader.close();
fileReader = null;
}
if (importer != null) {
importer.close();
importer = null;
}
}
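/**
* Loads the next row group if all rows loaded so far have been consumed. The row group comes from
* the prefetch queue when prefetching is enabled; otherwise it is read synchronously from the
* file. Column readers are then (re)created for the columns of the new row group.
*
* @return true if a row group is available to read from, false if all row groups are exhausted.
*/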
@SuppressWarnings("deprecation")
private boolean loadNextRowGroupIfNecessary() throws Throwable {
// More rows can be read from the loaded row group; no need to load the next one.
if (rowsRead != totalRowsLoaded) return true;
SQLMetric rowGroupTimeMetric = metrics.get("ParquetLoadRowGroupTime");
SQLMetric numRowGroupsMetric = metrics.get("ParquetRowGroups");
long startNs = System.nanoTime();
PageReadStore rowGroupReader = null;
if (prefetchTask != null && prefetchQueue != null) {
// Wait for pre-fetch task to finish.
Pair<PageReadStore, Long> rowGroupReaderPair = prefetchQueue.take();
rowGroupReader = rowGroupReaderPair.getLeft();
// Update the incremental bytes-read metric. Because this metric is maintained in Spark by a
// thread-local variable, we need to update it manually.
// TODO: We may expose metrics from `FileReader` and get from it directly.
long incBytesRead = rowGroupReaderPair.getRight();
FileSystem.getAllStatistics().stream()
.forEach(statistic -> statistic.incrementBytesRead(incBytesRead));
} else {
rowGroupReader = fileReader.readNextRowGroup();
}
if (rowGroupTimeMetric != null) {
rowGroupTimeMetric.add(System.nanoTime() - startNs);
}
if (rowGroupReader == null) {
return false;
}
if (numRowGroupsMetric != null) {
numRowGroupsMetric.add(1);
}
if (importer != null) importer.close();
importer = new CometSchemaImporter(ALLOCATOR);
List<ColumnDescriptor> columns = requestedSchema.getColumns();
for (int i = 0; i < columns.size(); i++) {
if (missingColumns[i]) continue;
if (columnReaders[i] != null) columnReaders[i].close();
// TODO: handle tz, datetime & int96 rebase
// TODO: consider passing page reader via ctor - however we need to fix the shading issue
// from Iceberg side.
DataType dataType = sparkSchema.fields()[i].dataType();
ColumnReader reader =
Utils.getColumnReader(
dataType,
columns.get(i),
importer,
capacity,
useDecimal128,
useLazyMaterialization,
useLegacyDateTimestamp);
reader.setPageReader(rowGroupReader.getPageReader(columns.get(i)));
columnReaders[i] = reader;
}
totalRowsLoaded += rowGroupReader.getRowCount();
return true;
}
// Submits a prefetch task for this reader.
public void submitPrefetchTask(ExecutorService threadPool) {
this.prefetchTask = threadPool.submit(new PrefetchTask());
}
// A task for prefetching parquet row groups.
private class PrefetchTask implements Callable<Option<Throwable>> {
private long getBytesRead() {
return FileSystem.getAllStatistics().stream()
.mapToLong(s -> s.getThreadStatistics().getBytesRead())
.sum();
}
@Override
public Option<Throwable> call() throws Exception {
// Gets the bytes read so far.
long baseline = getBytesRead();
try {
init();
while (true) {
PageReadStore rowGroupReader = fileReader.readNextRowGroup();
if (rowGroupReader == null) {
// Reached the end of the row groups.
return Option.empty();
} else {
long incBytesRead = getBytesRead() - baseline;
prefetchQueue.add(Pair.of(rowGroupReader, incBytesRead));
}
}
} catch (Throwable e) {
// Return the exception thrown during prefetching; `nextBatch()` will re-throw it.
return Option.apply(e);
} finally {
if (fileReader != null) {
fileReader.closeStream();
}
}
}
}
// The signature of externalAccums changed from returning a Buffer to returning a Seq. If Comet
// expects a Buffer but the Spark version returns a Seq, or vice versa, we get a
// method-not-found error, so we dispatch on the reflected return type instead.
@SuppressWarnings("unchecked")
private Option<AccumulatorV2<?, ?>> getTaskAccumulator(TaskMetrics taskMetrics) {
Method externalAccumsMethod;
try {
externalAccumsMethod = TaskMetrics.class.getDeclaredMethod("externalAccums");
externalAccumsMethod.setAccessible(true);
String returnType = externalAccumsMethod.getReturnType().getName();
if (returnType.equals("scala.collection.mutable.Buffer")) {
return ((Buffer<AccumulatorV2<?, ?>>) externalAccumsMethod.invoke(taskMetrics)).lastOption();
} else if (returnType.equals("scala.collection.Seq")) {
return ((Seq<AccumulatorV2<?, ?>>) externalAccumsMethod.invoke(taskMetrics)).lastOption();
} else {
return Option.apply(null); // None
}
} catch (NoSuchMethodException | InvocationTargetException | IllegalAccessException e) {
return Option.apply(null); // None
}
}
}