org.apache.comet.parquet.DictionaryPageReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of comet-spark-spark3.4_2.12
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet.parquet;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.InternalColumnDecryptionSetup;
import org.apache.parquet.crypto.InternalFileDecryptor;
import org.apache.parquet.crypto.ModuleCipherFactory;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.SeekableInputStream;

public class DictionaryPageReader implements DictionaryPageReadStore {
  private final Map> cache;
  private final InternalFileDecryptor fileDecryptor;
  private final SeekableInputStream inputStream;
  private final ParquetReadOptions options;
  private final Map columns;

  DictionaryPageReader(
      BlockMetaData block,
      InternalFileDecryptor fileDecryptor,
      SeekableInputStream inputStream,
      ParquetReadOptions options) {
    this.columns = new HashMap<>();
    this.cache = new ConcurrentHashMap<>();
    this.fileDecryptor = fileDecryptor;
    this.inputStream = inputStream;
    this.options = options;

    for (ColumnChunkMetaData column : block.getColumns()) {
      columns.put(column.getPath().toDotString(), column);
    }
  }

  @Override
  public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) {
    String dotPath = String.join(".", descriptor.getPath());
    ColumnChunkMetaData column = columns.get(dotPath);

    if (column == null) {
      throw new ParquetDecodingException("Failed to load dictionary, unknown column: " + dotPath);
    }

    return cache
        .computeIfAbsent(
            dotPath,
            key -> {
              try {
                final DictionaryPage dict =
                    column.hasDictionaryPage() ? readDictionary(column) : null;

                // Copy the dictionary to ensure it can be reused if it is returned
                // more than once. This can happen when a DictionaryFilter has two or
                // more predicates for the same column. Cache misses as well.
                return (dict != null) ? Optional.of(reusableCopy(dict)) : Optional.empty();
              } catch (IOException e) {
                throw new ParquetDecodingException("Failed to read dictionary", e);
              }
            })
        .orElse(null);
  }

  DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
    if (!meta.hasDictionaryPage()) {
      return null;
    }

    if (inputStream.getPos() != meta.getStartingPos()) {
      inputStream.seek(meta.getStartingPos());
    }

    boolean encryptedColumn = false;
    InternalColumnDecryptionSetup columnDecryptionSetup = null;
    byte[] dictionaryPageAAD = null;
    BlockCipher.Decryptor pageDecryptor = null;
    if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
      columnDecryptionSetup = fileDecryptor.getColumnSetup(meta.getPath());
      if (columnDecryptionSetup.isEncrypted()) {
        encryptedColumn = true;
      }
    }

    PageHeader pageHeader;
    if (!encryptedColumn) {
      pageHeader = Util.readPageHeader(inputStream);
    } else {
      byte[] dictionaryPageHeaderAAD =
          AesCipher.createModuleAAD(
              fileDecryptor.getFileAAD(),
              ModuleCipherFactory.ModuleType.DictionaryPageHeader,
              meta.getRowGroupOrdinal(),
              columnDecryptionSetup.getOrdinal(),
              -1);
      pageHeader =
          Util.readPageHeader(
              inputStream, columnDecryptionSetup.getMetaDataDecryptor(), dictionaryPageHeaderAAD);
      dictionaryPageAAD =
          AesCipher.createModuleAAD(
              fileDecryptor.getFileAAD(),
              ModuleCipherFactory.ModuleType.DictionaryPage,
              meta.getRowGroupOrdinal(),
              columnDecryptionSetup.getOrdinal(),
              -1);
      pageDecryptor = columnDecryptionSetup.getDataDecryptor();
    }

    if (!pageHeader.isSetDictionary_page_header()) {
      return null;
    }

    DictionaryPage compressedPage =
        readCompressedDictionary(pageHeader, inputStream, pageDecryptor, dictionaryPageAAD);
    CompressionCodecFactory.BytesInputDecompressor decompressor =
        options.getCodecFactory().getDecompressor(meta.getCodec());

    return new DictionaryPage(
        decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
        compressedPage.getDictionarySize(),
        compressedPage.getEncoding());
  }

  private DictionaryPage readCompressedDictionary(
      PageHeader pageHeader,
      SeekableInputStream fin,
      BlockCipher.Decryptor pageDecryptor,
      byte[] dictionaryPageAAD)
      throws IOException {
    DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();

    int uncompressedPageSize = pageHeader.getUncompressed_page_size();
    int compressedPageSize = pageHeader.getCompressed_page_size();

    byte[] dictPageBytes = new byte[compressedPageSize];
    fin.readFully(dictPageBytes);

    BytesInput bin = BytesInput.from(dictPageBytes);

    if (null != pageDecryptor) {
      bin = BytesInput.from(pageDecryptor.decrypt(bin.toByteArray(), dictionaryPageAAD));
    }

    return new DictionaryPage(
        bin,
        uncompressedPageSize,
        dictHeader.getNum_values(),
        org.apache.parquet.column.Encoding.valueOf(dictHeader.getEncoding().name()));
  }

  private static DictionaryPage reusableCopy(DictionaryPage dict) throws IOException {
    return new DictionaryPage(
        BytesInput.from(dict.getBytes().toByteArray()),
        dict.getDictionarySize(),
        dict.getEncoding());
  }
}