/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.comet.parquet;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.comet.shaded.arrow.c.ArrowArray;
import org.apache.comet.shaded.arrow.c.ArrowSchema;
import org.apache.comet.shaded.arrow.c.CometSchemaImporter;
import org.apache.comet.shaded.arrow.vector.FieldVector;
import org.apache.comet.shaded.arrow.vector.dictionary.Dictionary;
import org.apache.comet.shaded.arrow.vector.types.pojo.DictionaryEncoding;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.spark.sql.types.DataType;
import org.apache.comet.CometConf;
import org.apache.comet.vector.CometDecodedVector;
import org.apache.comet.vector.CometDictionary;
import org.apache.comet.vector.CometDictionaryVector;
import org.apache.comet.vector.CometPlainVector;
import org.apache.comet.vector.CometVector;
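/**
* A Parquet column reader that decodes pages natively and exposes each batch of values as an
* Arrow-backed {@link CometVector}.
*
* <p>A rough usage sketch; the row-group wiring shown here is illustrative, not part of this
* class:
*
* <pre>{@code
* ColumnReader reader = new ColumnReader(type, descriptor, importer, 8192, false, false);
* reader.setPageReader(rowGroup.getPageReader(descriptor)); // rowGroup: a PageReadStore
* reader.readBatch(8192);
* CometVector vector = reader.currentBatch();
* // ... consume the vector ...
* reader.close();
* }</pre>
*/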
public class ColumnReader extends AbstractColumnReader {
protected static final Logger LOG = LoggerFactory.getLogger(ColumnReader.class);
/**
* The current Comet vector holding all the values read by this column reader. Owned by this
* reader and MUST be closed after use.
*/
private CometDecodedVector currentVector;
/** Dictionary values for this column. Only set if the column is using dictionary encoding. */
protected CometDictionary dictionary;
/** Reader for dictionary & data pages in the current column chunk. */
protected PageReader pageReader;
/** Whether the first data page has been loaded. */
private boolean firstPageLoaded = false;
/**
* The number of nulls in the current batch, used when we skip importing Arrow vectors, in
* which case we simply update the null count of the existing vectors.
*/
int currentNumNulls;
/**
* The number of values in the current batch, used when we skip importing Arrow vectors, in
* which case we simply update the value count of the existing vectors.
*/
int currentNumValues;
/**
* Whether the last loaded vector contains any null value. This is used to determine if we can
* skip vector reloading. If the flag is false, the Arrow C data API will have skipped importing
* the validity buffer, and therefore we cannot skip vector reloading.
*/
boolean hadNull;
private final CometSchemaImporter importer;
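/**
* @param type the Spark SQL type of the column
* @param descriptor the Parquet column descriptor
* @param importer the importer used to bring Arrow vectors over from the native side via the
*     Arrow C data interface
* @param batchSize the maximum number of rows per batch; must be positive
* @param useDecimal128 whether to represent all decimal values using 128 bits
* @param useLegacyDateTimestamp whether to read dates/timestamps written with the legacy hybrid
*     (Julian + Gregorian) calendar as-is
*/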
public ColumnReader(
DataType type,
ColumnDescriptor descriptor,
CometSchemaImporter importer,
int batchSize,
boolean useDecimal128,
boolean useLegacyDateTimestamp) {
super(type, descriptor, useDecimal128, useLegacyDateTimestamp);
assert batchSize > 0 : "Batch size must be positive, found " + batchSize;
this.batchSize = batchSize;
this.importer = importer;
initNative();
}
/**
* Sets the page reader for a new column chunk to read. Callers are expected to call `readBatch`
* after this.
*
* @param pageReader the page reader for the new column chunk
*/
public void setPageReader(PageReader pageReader) throws IOException {
this.pageReader = pageReader;
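// In a Parquet column chunk the dictionary page, if present, always precedes the data pages,
// so pass it to the native decoder before any data page is read.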
DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
if (dictionaryPage != null) {
LOG.debug("dictionary page encoding = {}", dictionaryPage.getEncoding());
Native.setDictionaryPage(
nativeHandle,
dictionaryPage.getDictionarySize(),
dictionaryPage.getBytes().toByteArray(),
dictionaryPage.getEncoding().ordinal());
}
}
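/**
* Reads 'total' values into the current batch, pulling additional data pages from the page
* reader as each one is exhausted.
*/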
@Override
public void readBatch(int total) {
LOG.debug("Start to batch of size = " + total);
if (!firstPageLoaded) {
readPage();
firstPageLoaded = true;
}
// First reset the current columnar batch so that it can be filled with a new batch of values.
// Then keep decoding values, reading more data pages as each one is exhausted, until we have
// read 'total' values.
Native.resetBatch(nativeHandle);
int left = total, nullsRead = 0;
while (left > 0) {
int[] array = Native.readBatch(nativeHandle, left);
int valuesRead = array[0];
nullsRead += array[1];
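// Fewer values were decoded than requested, which means the current page is exhausted:
// load the next page and continue decoding from it.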
if (valuesRead < left) {
readPage();
}
left -= valuesRead;
}
this.currentNumValues = total;
this.currentNumNulls = nullsRead;
}
/** Returns the {@link CometVector} read by this reader. */
@Override
public CometVector currentBatch() {
return loadVector();
}
@Override
public void close() {
if (currentVector != null) {
currentVector.close();
currentVector = null;
}
super.close();
}
/** Returns a decoded {@link CometDecodedVector Comet vector}. */
public CometDecodedVector loadVector() {
// Only re-use the current Comet vector if all of the following hold:
// 1. The column is not dictionary encoded: with dictionary encoding, the native side may
// fall back to plain encoding, and the underlying memory address of the vector will
// change as a result.
// 2. The column type is of fixed width; in other words, string/binary are not supported,
// since the native side may resize the vector and therefore change its memory address.
// 3. Either the last loaded vector contained null values, or the current batch has none.
// If the values of the last vector were all non-null, the Arrow C data API skipped
// importing the validity buffer, so that vector cannot be re-used for a batch that does
// contain nulls.
// 4. The new number of values is the same or smaller than that of the current vector.
if ((hadNull || currentNumNulls == 0)
&& currentVector != null
&& dictionary == null
&& currentVector.isFixedLength()
&& currentVector.numValues() >= currentNumValues) {
currentVector.setNumNulls(currentNumNulls);
currentVector.setNumValues(currentNumValues);
return currentVector;
}
LOG.debug("Reloading vector");
// Close the previous vector first to release the struct memory allocated for importing the
// Arrow array & schema from the native side through the C data interface.
if (currentVector != null) {
currentVector.close();
}
LogicalTypeAnnotation logicalTypeAnnotation =
descriptor.getPrimitiveType().getLogicalTypeAnnotation();
boolean isUuid =
logicalTypeAnnotation instanceof LogicalTypeAnnotation.UUIDLogicalTypeAnnotation;
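// The native side returns the decoded batch as a pair of FFI struct addresses: the first
// points to an ArrowArray and the second to an ArrowSchema, per the Arrow C data interface.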
long[] addresses = Native.currentBatch(nativeHandle);
try (ArrowArray array = ArrowArray.wrap(addresses[0]);
ArrowSchema schema = ArrowSchema.wrap(addresses[1])) {
FieldVector vector = importer.importVector(array, schema);
DictionaryEncoding dictionaryEncoding = vector.getField().getDictionary();
CometPlainVector cometVector = new CometPlainVector(vector, useDecimal128);
// Update whether the current vector contains any null values. This is used in subsequent
// batches to determine whether we can skip loading the native vector.
hadNull = cometVector.hasNull();
if (dictionaryEncoding == null) {
if (dictionary != null) {
// This means the column was using dictionary encoding but has now fallen back to plain
// encoding on the native side. Set 'dictionary' to null here so we can use it as a
// condition to check whether the vector can be re-used later.
dictionary = null;
}
// Either the column is not dictionary encoded, or it was dictionary encoded but a new data
// page has switched back to plain encoding. In both cases we should return a plain vector.
currentVector = cometVector;
return currentVector;
}
// We must re-initialize the `CometDictionary` here because the `Data.importVector` API
// releases the previous dictionary vector and creates a new one.
Dictionary arrowDictionary = importer.getProvider().lookup(dictionaryEncoding.getId());
CometPlainVector dictionaryVector =
new CometPlainVector(arrowDictionary.getVector(), useDecimal128, isUuid);
if (dictionary != null) {
dictionary.setDictionaryVector(dictionaryVector);
} else {
dictionary = new CometDictionary(dictionaryVector);
}
currentVector =
new CometDictionaryVector(
cometVector, dictionary, importer.getProvider(), useDecimal128, false, isUuid);
return currentVector;
}
}
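/** Reads the next data page from the page reader and passes it to the native decoder. */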
protected void readPage() {
DataPage page = pageReader.readPage();
if (page == null) {
throw new RuntimeException("overreading: returned DataPage is null");
}
int pageValueCount = page.getValueCount();
page.accept(
new DataPage.Visitor<Void>() {
@Override
public Void visit(DataPageV1 dataPageV1) {
LOG.debug("data page encoding = {}", dataPageV1.getValueEncoding());
if (dataPageV1.getDlEncoding() != Encoding.RLE
&& descriptor.getMaxDefinitionLevel() != 0) {
throw new UnsupportedOperationException(
"Unsupported encoding: " + dataPageV1.getDlEncoding());
}
if (!isValidValueEncoding(dataPageV1.getValueEncoding())) {
throw new UnsupportedOperationException(
"Unsupported value encoding: " + dataPageV1.getValueEncoding());
}
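// Depending on config, pass the page bytes to the native side either as a ByteBuffer or
// as a heap byte array.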
try {
boolean useDirectBuffer =
(Boolean) CometConf.COMET_PARQUET_ENABLE_DIRECT_BUFFER().get();
if (useDirectBuffer) {
ByteBuffer buffer = dataPageV1.getBytes().toByteBuffer();
Native.setPageBufferV1(
nativeHandle, pageValueCount, buffer, dataPageV1.getValueEncoding().ordinal());
} else {
byte[] array = dataPageV1.getBytes().toByteArray();
Native.setPageV1(
nativeHandle, pageValueCount, array, dataPageV1.getValueEncoding().ordinal());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return null;
}
@Override
public Void visit(DataPageV2 dataPageV2) {
if (!isValidValueEncoding(dataPageV2.getDataEncoding())) {
throw new UnsupportedOperationException(
"Unsupported encoding: " + dataPageV2.getDataEncoding());
}
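// V2 data pages store repetition levels, definition levels and values in separate
// sections, so pass each section to the native decoder individually.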
try {
Native.setPageV2(
nativeHandle,
pageValueCount,
dataPageV2.getDefinitionLevels().toByteArray(),
dataPageV2.getRepetitionLevels().toByteArray(),
dataPageV2.getData().toByteArray(),
dataPageV2.getDataEncoding().ordinal());
} catch (IOException e) {
throw new RuntimeException(e);
}
return null;
}
});
}
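// Only these value encodings are currently supported. PLAIN_DICTIONARY is deprecated
// (superseded by RLE_DICTIONARY in Parquet v2) but still appears in v1 files, hence the
// deprecation suppression.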
@SuppressWarnings("deprecation")
private boolean isValidValueEncoding(Encoding encoding) {
switch (encoding) {
case PLAIN:
case RLE_DICTIONARY:
case PLAIN_DICTIONARY:
return true;
default:
return false;
}
}
}