All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.parquet.hadoop.ParquetFileWriter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import static org.apache.parquet.format.Util.writeFileCryptoMetaData;
import static org.apache.parquet.format.Util.writeFileMetaData;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.MAX_STATS_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.zip.CRC32;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetSizeOverflowException;
import org.apache.parquet.Preconditions;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.bytes.ByteBufferReleaser;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.bytes.ReusingByteBufferAllocator;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.statistics.SizeStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.ColumnEncryptionProperties;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.crypto.ModuleCipherFactory;
import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
import org.apache.parquet.crypto.ParquetCryptoRuntimeException;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;
import org.apache.parquet.hadoop.metadata.KeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.metadata.StrictKeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.PositionOutputStream;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.TypeUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Internal implementation of the Parquet file writer as a block container
 */
public class ParquetFileWriter implements AutoCloseable {
  private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class);

  // Writes the page headers; created in the constructors from the statistics truncate length.
  private final ParquetMetadataConverter metadataConverter;

  // NOTE(review): presumably the name of a summary metadata file; not used in this part of the file — confirm.
  public static final String PARQUET_METADATA_FILE = "_metadata";
  // Magic marker that start() writes at the beginning of the file unless the footer is encrypted.
  public static final String MAGIC_STR = "PAR1";
  public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
  // Magic marker that start() writes instead when the file encryptor reports an encrypted footer.
  public static final String EF_MAGIC_STR = "PARE";
  public static final byte[] EFMAGIC = EF_MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
  // NOTE(review): presumably the name of a schema-only summary file; not used in this part of the file — confirm.
  public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
  // NOTE(review): looks like a writer/format version number; its use is not visible here — confirm.
  public static final int CURRENT_VERSION = 1;

  /**
   * File creation modes.
   *
   * <p>The constructors of this class call {@code OutputFile.createOrOverwrite} for
   * {@link #OVERWRITE} and {@code OutputFile.create} for any other mode.
   */
  public enum Mode {
    CREATE,
    OVERWRITE
  }

  protected final PositionOutputStream out;

  private final MessageType schema;
  private final AlignmentStrategy alignment;
  private final int columnIndexTruncateLength;

  // file data
  private final List blocks = new ArrayList();

  // The column/offset indexes per blocks per column chunks
  private final List> columnIndexes = new ArrayList<>();
  private final List> offsetIndexes = new ArrayList<>();

  // The Bloom filters
  private final List> bloomFilters = new ArrayList<>();

  // The file encryptor
  private final InternalFileEncryptor fileEncryptor;

  // row group data
  private BlockMetaData currentBlock; // appended to by endColumn

  // The column/offset indexes for the actual block
  private List currentColumnIndexes;
  private List currentOffsetIndexes;

  // The Bloom filter for the actual block
  private Map currentBloomFilters;

  // row group data set at the start of a row group
  private long currentRecordCount; // set in startBlock

  // column chunk data accumulated as pages are written
  private final EncodingStats.Builder encodingStatsBuilder;
  private Set currentEncodings;
  private long uncompressedLength;
  private long compressedLength;
  private Statistics currentStatistics; // accumulated in writePage(s)
  private SizeStatistics currentSizeStatistics; // accumulated in writePage(s)
  private ColumnIndexBuilder columnIndexBuilder;
  private OffsetIndexBuilder offsetIndexBuilder;

  // column chunk data set at the start of a column
  private CompressionCodecName currentChunkCodec; // set in startColumn
  private ColumnPath currentChunkPath; // set in startColumn
  private PrimitiveType currentChunkType; // set in startColumn
  private long currentChunkValueCount; // set in startColumn
  private long currentChunkFirstDataPage; // set in startColumn & page writes
  private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage

  // set when end is called
  private ParquetMetadata footer = null;
  private boolean closed;

  private final CRC32 crc;
  private final ReusingByteBufferAllocator crcAllocator;
  private final boolean pageWriteChecksumEnabled;

  /**
   * Captures the order in which the writer's lifecycle methods may be called.
   *
   * <p>Each constant overrides only the transitions that are legal from it. Any other
   * transition uses the enum-level default, which throws an {@link IOException} naming the
   * current state.
   */
  private enum STATE {
    NOT_STARTED {
      @Override
      STATE start() {
        return STARTED;
      }
    },
    STARTED {
      @Override
      STATE startBlock() {
        return BLOCK;
      }

      @Override
      STATE end() {
        return ENDED;
      }
    },
    BLOCK {
      @Override
      STATE startColumn() {
        return COLUMN;
      }

      @Override
      STATE endBlock() {
        return STARTED;
      }
    },
    COLUMN {
      @Override
      STATE endColumn() {
        return BLOCK;
      }

      // Writing pages keeps the writer inside the current column.
      @Override
      STATE write() {
        return COLUMN;
      }
    },
    ENDED;

    // Default transitions: illegal unless a constant overrides them.

    STATE start() throws IOException {
      return error();
    }

    STATE startBlock() throws IOException {
      return error();
    }

    STATE startColumn() throws IOException {
      return error();
    }

    STATE write() throws IOException {
      return error();
    }

    STATE endColumn() throws IOException {
      return error();
    }

    STATE endBlock() throws IOException {
      return error();
    }

    STATE end() throws IOException {
      return error();
    }

    /**
     * Always throws; declared to return {@code STATE} so callers can write {@code return error()}.
     */
    private STATE error() throws IOException {
      String message =
          "The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: "
              + name();
      throw new IOException(message);
    }
  }

  // Current lifecycle state; each public write/start method advances it via the matching STATE transition.
  private STATE state = STATE.NOT_STARTED;

  /**
   * Creates a writer for a Hadoop path using {@link Mode#CREATE}, the default block size and
   * the default maximum padding.
   *
   * @param configuration Hadoop configuration
   * @param schema        the schema of the data
   * @param file          the file to write to
   * @throws IOException if the file can not be created
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetFileWriter(Configuration configuration, MessageType schema, Path file) throws IOException {
    // Same defaults as the mode-taking overload, with the mode fixed to CREATE.
    this(configuration, schema, file, Mode.CREATE);
  }

  /**
   * Creates a writer for a Hadoop path with the given creation mode, the default block size and
   * the default maximum padding.
   *
   * @param configuration Hadoop configuration
   * @param schema        the schema of the data
   * @param file          the file to write to
   * @param mode          file creation mode
   * @throws IOException if the file can not be created
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetFileWriter(Configuration configuration, MessageType schema, Path file, Mode mode) throws IOException {
    // The six-argument overload wraps the path in a HadoopOutputFile exactly as this one used to.
    this(configuration, schema, file, mode, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT);
  }

  /**
   * Creates a writer for a Hadoop path; the path is wrapped in a {@link HadoopOutputFile}.
   *
   * @param configuration  Hadoop configuration
   * @param schema         the schema of the data
   * @param file           the file to write to
   * @param mode           file creation mode
   * @param rowGroupSize   the row group size
   * @param maxPaddingSize the maximum padding
   * @throws IOException if the file can not be created
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetFileWriter(
      Configuration configuration, MessageType schema, Path file, Mode mode, long rowGroupSize, int maxPaddingSize)
      throws IOException {
    this(
        HadoopOutputFile.fromPath(file, configuration),
        schema,
        mode,
        rowGroupSize,
        maxPaddingSize);
  }

  /**
   * Creates a writer with the default column-index truncate length, statistics truncate length
   * and page-checksum setting taken from {@link ParquetProperties}.
   *
   * @param file           OutputFile to create or overwrite
   * @param schema         the schema of the data
   * @param mode           file creation mode
   * @param rowGroupSize   the row group size
   * @param maxPaddingSize the maximum padding
   * @throws IOException if the file can not be created
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode, long rowGroupSize, int maxPaddingSize)
      throws IOException {
    this(
        file, schema, mode, rowGroupSize, maxPaddingSize,
        ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
        ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
        ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
  }

  /**
   * Creates a writer without encryption.
   *
   * @param file                      OutputFile to create or overwrite
   * @param schema                    the schema of the data
   * @param mode                      file creation mode
   * @param rowGroupSize              the row group size
   * @param maxPaddingSize            the maximum padding
   * @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
   * @param statisticsTruncateLength  the length which the min/max values in row groups tried to be truncated to
   * @param pageWriteChecksumEnabled  whether to write out page level checksums
   * @throws IOException if the file can not be created
   */
  public ParquetFileWriter(
      OutputFile file,
      MessageType schema,
      Mode mode,
      long rowGroupSize,
      int maxPaddingSize,
      int columnIndexTruncateLength,
      int statisticsTruncateLength,
      boolean pageWriteChecksumEnabled)
      throws IOException {
    // The cast selects the FileEncryptionProperties overload; null properties mean no encryption.
    this(
        file, schema, mode, rowGroupSize, maxPaddingSize,
        columnIndexTruncateLength, statisticsTruncateLength, pageWriteChecksumEnabled,
        (FileEncryptionProperties) null);
  }

  /**
   * Creates a writer that optionally encrypts the file according to the given properties.
   * No pre-built encryptor and no allocator are supplied to the underlying constructor.
   *
   * @param file                      OutputFile to create or overwrite
   * @param schema                    the schema of the data
   * @param mode                      file creation mode
   * @param rowGroupSize              the row group size
   * @param maxPaddingSize            the maximum padding
   * @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
   * @param statisticsTruncateLength  the length which the min/max values in row groups tried to be truncated to
   * @param pageWriteChecksumEnabled  whether to write out page level checksums
   * @param encryptionProperties      file encryption properties; {@code null} writes an unencrypted file
   * @throws IOException if the file can not be created
   */
  public ParquetFileWriter(
      OutputFile file,
      MessageType schema,
      Mode mode,
      long rowGroupSize,
      int maxPaddingSize,
      int columnIndexTruncateLength,
      int statisticsTruncateLength,
      boolean pageWriteChecksumEnabled,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    this(
        file, schema, mode, rowGroupSize, maxPaddingSize,
        columnIndexTruncateLength, statisticsTruncateLength, pageWriteChecksumEnabled,
        encryptionProperties, null, null);
  }

  /**
   * Creates a writer whose column-index truncate length, statistics truncate length,
   * page-checksum setting and allocator are all read from {@code props}.
   *
   * @param file                 OutputFile to create or overwrite
   * @param schema               the schema of the data
   * @param mode                 file creation mode
   * @param rowGroupSize         the row group size
   * @param maxPaddingSize       the maximum padding
   * @param encryptionProperties file encryption properties; {@code null} writes an unencrypted file
   * @param props                writer properties supplying the remaining settings
   * @throws IOException if the file can not be created
   */
  public ParquetFileWriter(
      OutputFile file,
      MessageType schema,
      Mode mode,
      long rowGroupSize,
      int maxPaddingSize,
      FileEncryptionProperties encryptionProperties,
      ParquetProperties props)
      throws IOException {
    this(
        file, schema, mode, rowGroupSize, maxPaddingSize,
        props.getColumnIndexTruncateLength(),
        props.getStatisticsTruncateLength(),
        props.getPageWriteChecksumEnabled(),
        encryptionProperties, null, props.getAllocator());
  }

  /**
   * Creates a writer that uses an already constructed file encryptor. No encryption properties
   * are passed; the underlying constructor obtains them from the encryptor when it is non-null.
   *
   * @param file                      OutputFile to create or overwrite
   * @param schema                    the schema of the data
   * @param mode                      file creation mode
   * @param rowGroupSize              the row group size
   * @param maxPaddingSize            the maximum padding
   * @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
   * @param statisticsTruncateLength  the length which the min/max values in row groups tried to be truncated to
   * @param pageWriteChecksumEnabled  whether to write out page level checksums
   * @param encryptor                 the file encryptor to use; {@code null} writes an unencrypted file
   * @throws IOException if the file can not be created
   */
  @Deprecated
  public ParquetFileWriter(
      OutputFile file,
      MessageType schema,
      Mode mode,
      long rowGroupSize,
      int maxPaddingSize,
      int columnIndexTruncateLength,
      int statisticsTruncateLength,
      boolean pageWriteChecksumEnabled,
      InternalFileEncryptor encryptor)
      throws IOException {
    this(
        file, schema, mode, rowGroupSize, maxPaddingSize,
        columnIndexTruncateLength, statisticsTruncateLength, pageWriteChecksumEnabled,
        null, encryptor, null);
  }

  /**
   * Common constructor behind the public {@link OutputFile}-based constructors.
   *
   * <p>The schema and the encryption setup are validated, and all in-memory state is
   * initialised, before the output stream is opened. A rejected configuration therefore neither
   * creates the file nor leaves an open stream behind.
   *
   * @param file                      OutputFile to create or overwrite
   * @param schema                    the schema of the data
   * @param mode                      file creation mode
   * @param rowGroupSize              the row group size
   * @param maxPaddingSize            the maximum padding
   * @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
   * @param statisticsTruncateLength  the length which the min/max values in row groups tried to be truncated to
   * @param pageWriteChecksumEnabled  whether to write out page level checksums
   * @param encryptionProperties      file encryption properties, may be {@code null}
   * @param encryptor                 pre-built file encryptor, may be {@code null}
   * @param allocator                 allocator backing the checksum buffers; a heap allocator is used when {@code null}
   * @throws IOException if the file can not be created
   * @throws ParquetCryptoRuntimeException if an encrypted column is not part of the schema
   */
  private ParquetFileWriter(
      OutputFile file,
      MessageType schema,
      Mode mode,
      long rowGroupSize,
      int maxPaddingSize,
      int columnIndexTruncateLength,
      int statisticsTruncateLength,
      boolean pageWriteChecksumEnabled,
      FileEncryptionProperties encryptionProperties,
      InternalFileEncryptor encryptor,
      ByteBufferAllocator allocator)
      throws IOException {
    TypeUtil.checkValidWriteSchema(schema);

    this.schema = schema;

    // Validate the encryption configuration first: it may throw, and nothing has been opened yet.
    this.fileEncryptor = resolveFileEncryptor(schema, encryptionProperties, encryptor);

    long blockSize = rowGroupSize;
    if (file.supportsBlockSize()) {
      blockSize = Math.max(file.defaultBlockSize(), rowGroupSize);
      this.alignment = PaddingAlignment.get(blockSize, rowGroupSize, maxPaddingSize);
    } else {
      this.alignment = NoAlignment.get(rowGroupSize);
    }

    this.encodingStatsBuilder = new EncodingStats.Builder();
    this.columnIndexTruncateLength = columnIndexTruncateLength;
    this.pageWriteChecksumEnabled = pageWriteChecksumEnabled;
    this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
    this.crcAllocator = pageWriteChecksumEnabled
        ? ReusingByteBufferAllocator.strict(allocator == null ? new HeapByteBufferAllocator() : allocator)
        : null;

    this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength);

    // Open the stream last so that no earlier failure can leak it.
    if (mode == Mode.OVERWRITE) {
      this.out = file.createOrOverwrite(blockSize);
    } else {
      this.out = file.create(blockSize);
    }
  }

  /**
   * Determines the file encryptor to use and verifies that every explicitly encrypted column
   * exists in the schema.
   *
   * @return {@code null} when neither properties nor an encryptor are given; the supplied
   *     encryptor when there is one; otherwise a new encryptor built from the properties
   */
  private static InternalFileEncryptor resolveFileEncryptor(
      MessageType schema, FileEncryptionProperties encryptionProperties, InternalFileEncryptor encryptor) {
    if (null == encryptionProperties && null == encryptor) {
      return null;
    }

    FileEncryptionProperties effectiveProperties =
        null == encryptionProperties ? encryptor.getEncryptionProperties() : encryptionProperties;

    // Verify that every encrypted column is in file schema
    Map<ColumnPath, ColumnEncryptionProperties> columnEncryptionProperties =
        effectiveProperties.getEncryptedColumns();
    // if null, every column in file schema will be encrypted with footer key
    if (null != columnEncryptionProperties) {
      for (ColumnPath encryptedColumn : columnEncryptionProperties.keySet()) {
        if (!schema.containsPath(encryptedColumn.toArray())) {
          // Render the schema columns as "[a], [b.c]" for the error message.
          StringBuilder columnList = new StringBuilder();
          for (String[] columnPath : schema.getPaths()) {
            if (columnList.length() > 0) {
              columnList.append(", ");
            }
            columnList
                .append("[")
                .append(ColumnPath.get(columnPath).toDotString())
                .append("]");
          }
          throw new ParquetCryptoRuntimeException(
              "Encrypted column [" + encryptedColumn.toDotString() + "] not in file schema column list: "
                  + columnList);
        }
      }
    }

    return null == encryptor ? new InternalFileEncryptor(effectiveProperties) : encryptor;
  }

  /**
   * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
   *
   * @param configuration   Hadoop configuration
   * @param schema          the schema of the data
   * @param file            the file to write to
   * @param rowAndBlockSize the row group size
   * @param maxPaddingSize  the maximum padding
   * @param columnIndexTruncateLength  the length which the min/max values in column indexes tried to be truncated to
   * @param allocator       allocator to potentially allocate {@link java.nio.ByteBuffer} objects
   * @throws IOException if the file can not be created
   */
  ParquetFileWriter(
      Configuration configuration,
      MessageType schema,
      Path file,
      long rowAndBlockSize,
      int maxPaddingSize,
      int columnIndexTruncateLength,
      ByteBufferAllocator allocator)
      throws IOException {
    FileSystem fs = file.getFileSystem(configuration);
    this.schema = schema;
    this.alignment = PaddingAlignment.get(rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
    this.out = HadoopStreams.wrap(fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
    this.encodingStatsBuilder = new EncodingStats.Builder();
    this.columnIndexTruncateLength = columnIndexTruncateLength;
    this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
    this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
    this.crcAllocator = pageWriteChecksumEnabled
        ? ReusingByteBufferAllocator.strict(allocator == null ? new HeapByteBufferAllocator() : allocator)
        : null;
    this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
    this.fileEncryptor = null;
  }

  /**
   * Starts the file by writing the leading magic bytes.
   *
   * <p>{@link #EFMAGIC} is written when a file encryptor is present and reports an encrypted
   * footer; otherwise {@link #MAGIC} is written.
   *
   * @throws IOException if the writer is in a state that does not allow starting, or if
   *     there is an error while writing
   */
  public void start() throws IOException {
    state = state.start();
    LOG.debug("{}: start", out.getPos());
    final boolean encryptedFooter = fileEncryptor != null && fileEncryptor.isFooterEncrypted();
    out.write(encryptedFooter ? EFMAGIC : MAGIC);
  }

  /**
   * @return the file encryptor used by this writer, or {@code null} when the file is written
   *     without encryption
   */
  public InternalFileEncryptor getEncryptor() {
    return this.fileEncryptor;
  }

  /**
   * Starts a new block (row group): aligns the output for the row group and resets the
   * per-block bookkeeping.
   *
   * @param recordCount the record count in this block
   * @throws IOException if the writer is in a state that does not allow starting a block, or
   *     if there is an error while writing
   */
  public void startBlock(long recordCount) throws IOException {
    state = state.startBlock();
    LOG.debug("{}: start block", out.getPos());
    //    out.write(MAGIC); // TODO: add a magic delimiter

    alignment.alignForRowGroup(out);

    // Fresh accumulators for the block that is about to be written.
    currentRecordCount = recordCount;
    currentBlock = new BlockMetaData();
    currentBloomFilters = new HashMap<>();
    currentOffsetIndexes = new ArrayList<>();
    currentColumnIndexes = new ArrayList<>();
  }

  /**
   * Starts a column chunk inside the current block and resets all per-column accumulators
   * (encodings, lengths, statistics and the column/offset index builders).
   *
   * @param descriptor           the column descriptor
   * @param valueCount           the value count in this column
   * @param compressionCodecName a compression codec name
   * @throws IOException if the writer is in a state that does not allow starting a column
   */
  public void startColumn(ColumnDescriptor descriptor, long valueCount, CompressionCodecName compressionCodecName)
      throws IOException {
    state = state.startColumn();
    encodingStatsBuilder.clear();
    currentEncodings = new HashSet<>();
    currentChunkPath = ColumnPath.get(descriptor.getPath());
    currentChunkType = descriptor.getPrimitiveType();
    currentChunkCodec = compressionCodecName;
    currentChunkValueCount = valueCount;
    // Negative means no data page has been written for this chunk yet.
    currentChunkFirstDataPage = -1;
    compressedLength = 0;
    uncompressedLength = 0;
    // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one
    currentStatistics = null;
    currentSizeStatistics = SizeStatistics.newBuilder(
            descriptor.getPrimitiveType(),
            descriptor.getMaxRepetitionLevel(),
            descriptor.getMaxDefinitionLevel())
        .build();

    columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength);
    offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
  }

  /**
   * Writes a dictionary page without a header encryptor and without AAD.
   *
   * @param dictionaryPage the dictionary page
   * @throws IOException if there is an error while writing
   */
  public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
    final BlockCipher.Encryptor noHeaderEncryptor = null;
    final byte[] noAad = null;
    writeDictionaryPage(dictionaryPage, noHeaderEncryptor, noAad);
  }

  /**
   * Writes a dictionary page: the page header (with a CRC of the page bytes when page checksums
   * are enabled) followed by the page bytes. The header size is added to both the compressed and
   * the uncompressed length of the current column chunk.
   *
   * @param dictionaryPage       the dictionary page
   * @param headerBlockEncryptor encryptor passed on to the page header writer, may be {@code null}
   * @param AAD                  AAD passed on to the page header writer, may be {@code null}
   * @throws IOException if there is an error while writing
   */
  public void writeDictionaryPage(
      DictionaryPage dictionaryPage, BlockCipher.Encryptor headerBlockEncryptor, byte[] AAD) throws IOException {
    state = state.write();
    LOG.debug("{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize());
    currentChunkDictionaryPageOffset = out.getPos();

    final int rawSize = dictionaryPage.getUncompressedSize();
    final int storedSize = Math.toIntExact(dictionaryPage.getBytes().size());
    final int entryCount = dictionaryPage.getDictionarySize();
    final Encoding dictionaryEncoding = dictionaryPage.getEncoding();

    if (!pageWriteChecksumEnabled) {
      metadataConverter.writeDictionaryPageHeader(
          rawSize, storedSize, entryCount, dictionaryEncoding, out, headerBlockEncryptor, AAD);
    } else {
      crc.reset();
      crcUpdate(dictionaryPage.getBytes());
      final int checksum = (int) crc.getValue();
      metadataConverter.writeDictionaryPageHeader(
          rawSize, storedSize, entryCount, dictionaryEncoding, checksum, out, headerBlockEncryptor, AAD);
    }

    final long headerBytes = out.getPos() - currentChunkDictionaryPageOffset;
    uncompressedLength += rawSize + headerBytes;
    compressedLength += storedSize + headerBytes;
    LOG.debug("{}: write dictionary page content {}", out.getPos(), storedSize);
    // for encrypted column, dictionary page bytes are already encrypted
    dictionaryPage.getBytes().writeAllTo(out);
    encodingStatsBuilder.addDictEncoding(dictionaryEncoding);
    currentEncodings.add(dictionaryEncoding);
  }

  /**
   * Writes a single V1 data page (header followed by the page bytes).
   *
   * <p>No row count is available here, so the column and offset index builders of the current
   * column are replaced with no-op builders.
   *
   * @param valueCount           count of values
   * @param uncompressedPageSize the size of the data once uncompressed
   * @param bytes                the compressed data for the page without header
   * @param rlEncoding           encoding of the repetition level
   * @param dlEncoding           encoding of the definition level
   * @param valuesEncoding       encoding of values
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding)
      throws IOException {
    state = state.write();
    // We are unable to build indexes without rowCount so skip them for this column
    columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
    offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();

    final long pageStart = out.getPos();
    LOG.debug("{}: write data page: {} values", pageStart, valueCount);
    final int storedSize = toIntWithCheck(bytes.size(), "page");
    metadataConverter.writeDataPageV1Header(
        uncompressedPageSize, storedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding, out);

    final long headerBytes = out.getPos() - pageStart;
    uncompressedLength += uncompressedPageSize + headerBytes;
    compressedLength += storedSize + headerBytes;
    LOG.debug("{}: write data page content {}", out.getPos(), storedSize);
    bytes.writeAllTo(out);

    encodingStatsBuilder.addDataEncoding(valuesEncoding);
    currentEncodings.add(rlEncoding);
    currentEncodings.add(dlEncoding);
    currentEncodings.add(valuesEncoding);
    // Remember where the first data page of this chunk begins.
    if (currentChunkFirstDataPage < 0) {
      currentChunkFirstDataPage = pageStart;
    }
  }

  /**
   * Writes a single page with statistics but without a row count; the column and offset index
   * builders of the current column are therefore replaced with no-op builders. No header
   * encryptor, AAD or size statistics are used.
   *
   * @param valueCount           count of values
   * @param uncompressedPageSize the size of the data once uncompressed
   * @param bytes                the compressed data for the page without header
   * @param statistics           statistics for the page
   * @param rlEncoding           encoding of the repetition level
   * @param dlEncoding           encoding of the definition level
   * @param valuesEncoding       encoding of values
   * @throws IOException if there is an error while writing
   * @deprecated this method does not support writing column indexes; Use
   * {@link #writeDataPage(int, int, BytesInput, Statistics, long, Encoding, Encoding, Encoding)} instead
   */
  @Deprecated
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding)
      throws IOException {
    // We are unable to build indexes without rowCount so skip them for this column
    columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
    offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
    innerWriteDataPage(
        valueCount, uncompressedPageSize, bytes, statistics,
        rlEncoding, dlEncoding, valuesEncoding,
        null, null, null);
  }

  /**
   * Writes a single page, passing {@code null} for the header encryptor, the page header AAD
   * and the size statistics.
   *
   * @param valueCount           count of values
   * @param uncompressedPageSize the size of the data once uncompressed
   * @param bytes                the compressed data for the page without header
   * @param statistics           the statistics of the page
   * @param rowCount             the number of rows in the page
   * @param rlEncoding           encoding of the repetition level
   * @param dlEncoding           encoding of the definition level
   * @param valuesEncoding       encoding of values
   * @throws IOException if any I/O error occurs during writing the file
   */
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      long rowCount,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding)
      throws IOException {
    writeDataPage(
        valueCount, uncompressedPageSize, bytes, statistics, rowCount,
        rlEncoding, dlEncoding, valuesEncoding,
        null, null, null);
  }

  /**
   * Writes a single page with an optional header encryptor, passing {@code null} for the size
   * statistics.
   *
   * @param valueCount             count of values
   * @param uncompressedPageSize   the size of the data once uncompressed
   * @param bytes                  the compressed data for the page without header
   * @param statistics             the statistics of the page
   * @param rowCount               the number of rows in the page
   * @param rlEncoding             encoding of the repetition level
   * @param dlEncoding             encoding of the definition level
   * @param valuesEncoding         encoding of values
   * @param metadataBlockEncryptor encryptor for block data
   * @param pageHeaderAAD          pageHeader AAD
   * @throws IOException if any I/O error occurs during writing the file
   */
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      long rowCount,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD)
      throws IOException {
    writeDataPage(
        valueCount, uncompressedPageSize, bytes, statistics, rowCount,
        rlEncoding, dlEncoding, valuesEncoding,
        metadataBlockEncryptor, pageHeaderAAD,
        null);
  }

  /**
   * Writes a single page and then records it in the offset index builder, using the number of
   * bytes the output position advanced while the page was written.
   *
   * @param valueCount             count of values
   * @param uncompressedPageSize   the size of the data once uncompressed
   * @param bytes                  the compressed data for the page without header
   * @param statistics             the statistics of the page
   * @param rowCount               the number of rows in the page
   * @param rlEncoding             encoding of the repetition level
   * @param dlEncoding             encoding of the definition level
   * @param valuesEncoding         encoding of values
   * @param metadataBlockEncryptor encryptor for block data
   * @param pageHeaderAAD          pageHeader AAD
   * @param sizeStatistics         size statistics for the page, may be {@code null}
   * @throws IOException if any I/O error occurs during writing the file
   */
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      long rowCount,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD,
      SizeStatistics sizeStatistics)
      throws IOException {
    final long pageStart = out.getPos();
    innerWriteDataPage(
        valueCount, uncompressedPageSize, bytes, statistics,
        rlEncoding, dlEncoding, valuesEncoding,
        metadataBlockEncryptor, pageHeaderAAD, sizeStatistics);

    // Header plus page bytes, measured from the stream position.
    final int writtenBytes = toIntWithCheck(out.getPos() - pageStart, "page");
    offsetIndexBuilder.add(
        writtenBytes,
        rowCount,
        sizeStatistics != null ? sizeStatistics.getUnencodedByteArrayDataBytes() : Optional.empty());
  }

  /**
   * Forwards unchanged to the {@code writeDataPage} overload that takes statistics, the three
   * encodings, the header encryptor, the AAD and the size statistics, but no row count.
   *
   * @throws IOException if there is an error while writing
   */
  private void innerWriteDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD,
      SizeStatistics sizeStatistics)
      throws IOException {
    writeDataPage(
        valueCount, uncompressedPageSize, bytes, statistics,
        rlEncoding, dlEncoding, valuesEncoding,
        metadataBlockEncryptor, pageHeaderAAD, sizeStatistics);
  }

  /**
   * Writes a single page with an optional header encryptor and no row count, passing
   * {@code null} for the size statistics.
   *
   * @param valueCount             count of values
   * @param uncompressedPageSize   the size of the data once uncompressed
   * @param bytes                  the compressed data for the page without header
   * @param statistics             statistics for the page
   * @param rlEncoding             encoding of the repetition level
   * @param dlEncoding             encoding of the definition level
   * @param valuesEncoding         encoding of values
   * @param metadataBlockEncryptor encryptor for block data
   * @param pageHeaderAAD          pageHeader AAD
   * @throws IOException if there is an error while writing
   */
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD)
      throws IOException {
    writeDataPage(
        valueCount, uncompressedPageSize, bytes, statistics,
        rlEncoding, dlEncoding, valuesEncoding,
        metadataBlockEncryptor, pageHeaderAAD,
        null);
  }

  /**
   * Writes a single v1 data page: the page header (with a CRC when page checksums are enabled)
   * followed by the page payload, then folds the page statistics and encodings into the current
   * column chunk state.
   *
   * @param valueCount count of values
   * @param uncompressedPageSize the size of the data once uncompressed
   * @param bytes the compressed data for the page without header
   * @param statistics statistics for the page
   * @param rlEncoding encoding of the repetition level
   * @param dlEncoding encoding of the definition level
   * @param valuesEncoding encoding of values
   * @param metadataBlockEncryptor encryptor for block data
   * @param pageHeaderAAD pageHeader AAD
   * @param sizeStatistics size statistics for the page
   * @throws IOException if there is an error while writing
   */
  public void writeDataPage(
      int valueCount,
      int uncompressedPageSize,
      BytesInput bytes,
      Statistics statistics,
      Encoding rlEncoding,
      Encoding dlEncoding,
      Encoding valuesEncoding,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD,
      SizeStatistics sizeStatistics)
      throws IOException {
    state = state.write();

    final long pageStart = out.getPos();
    // The first data page written for a chunk fixes the chunk's first-data-page offset.
    if (currentChunkFirstDataPage < 0) {
      currentChunkFirstDataPage = pageStart;
    }
    LOG.debug("{}: write data page: {} values", pageStart, valueCount);

    final int compressedPageSize = toIntWithCheck(bytes.size(), "page");
    if (!pageWriteChecksumEnabled) {
      metadataConverter.writeDataPageV1Header(
          uncompressedPageSize,
          compressedPageSize,
          valueCount,
          rlEncoding,
          dlEncoding,
          valuesEncoding,
          out,
          metadataBlockEncryptor,
          pageHeaderAAD);
    } else {
      // The checksum covers the page payload only and is stored in the header.
      crc.reset();
      crcUpdate(bytes);
      metadataConverter.writeDataPageV1Header(
          uncompressedPageSize,
          compressedPageSize,
          valueCount,
          rlEncoding,
          dlEncoding,
          valuesEncoding,
          (int) crc.getValue(),
          out,
          metadataBlockEncryptor,
          pageHeaderAAD);
    }

    // Header bytes count towards both the compressed and the uncompressed chunk length.
    final long headerLength = out.getPos() - pageStart;
    uncompressedLength += uncompressedPageSize + headerLength;
    compressedLength += compressedPageSize + headerLength;

    LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
    bytes.writeAllTo(out);

    mergeColumnStatistics(statistics, sizeStatistics);

    encodingStatsBuilder.addDataEncoding(valuesEncoding);
    currentEncodings.add(rlEncoding);
    currentEncodings.add(dlEncoding);
    currentEncodings.add(valuesEncoding);
  }

  /**
   * Registers a Bloom filter for the given column in the current set of Bloom filters, replacing
   * any filter previously registered under the same column name.
   *
   * @param column      the column name
   * @param bloomFilter the bloom filter of column values
   */
  public void addBloomFilter(String column, BloomFilter bloomFilter) {
    this.currentBloomFilters.put(column, bloomFilter);
  }

  /**
   * Writes a single v2 data page without header encryption and without size statistics.
   *
   * <p>Delegates to the full overload, passing {@code null} for the encryptor, the page header
   * AAD and the size statistics.
   *
   * @param rowCount             count of rows
   * @param nullCount            count of nulls
   * @param valueCount           count of values
   * @param repetitionLevels     repetition level bytes
   * @param definitionLevels     definition level bytes
   * @param dataEncoding         encoding for data
   * @param compressedData       compressed data bytes
   * @param uncompressedDataSize the size of uncompressed data
   * @param statistics           the statistics of the page
   * @throws IOException if any I/O error occurs during writing the file
   */
  public void writeDataPageV2(
      int rowCount,
      int nullCount,
      int valueCount,
      BytesInput repetitionLevels,
      BytesInput definitionLevels,
      Encoding dataEncoding,
      BytesInput compressedData,
      int uncompressedDataSize,
      Statistics statistics)
      throws IOException {
    final BlockCipher.Encryptor noEncryptor = null;
    final byte[] noPageHeaderAAD = null;
    final SizeStatistics noSizeStatistics = null;
    writeDataPageV2(
        rowCount,
        nullCount,
        valueCount,
        repetitionLevels,
        definitionLevels,
        dataEncoding,
        compressedData,
        uncompressedDataSize,
        statistics,
        noEncryptor,
        noPageHeaderAAD,
        noSizeStatistics);
  }

  /**
   * Writes a single v2 data page without page-level size statistics.
   *
   * <p>Delegates to the full overload, passing {@code null} for the size statistics.
   *
   * @param rowCount               count of rows
   * @param nullCount              count of nulls
   * @param valueCount             count of values
   * @param repetitionLevels       repetition level bytes
   * @param definitionLevels       definition level bytes
   * @param dataEncoding           encoding for data
   * @param compressedData         compressed data bytes
   * @param uncompressedDataSize   the size of uncompressed data
   * @param statistics             the statistics of the page
   * @param metadataBlockEncryptor encryptor for block data
   * @param pageHeaderAAD          pageHeader AAD
   * @throws IOException if any I/O error occurs during writing the file
   */
  public void writeDataPageV2(
      int rowCount,
      int nullCount,
      int valueCount,
      BytesInput repetitionLevels,
      BytesInput definitionLevels,
      Encoding dataEncoding,
      BytesInput compressedData,
      int uncompressedDataSize,
      Statistics statistics,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD)
      throws IOException {
    final SizeStatistics noSizeStatistics = null;
    writeDataPageV2(
        rowCount,
        nullCount,
        valueCount,
        repetitionLevels,
        definitionLevels,
        dataEncoding,
        compressedData,
        uncompressedDataSize,
        statistics,
        metadataBlockEncryptor,
        pageHeaderAAD,
        noSizeStatistics);
  }

  /**
   * Writes a single v2 data page: the page header (with a CRC when page checksums are enabled),
   * then the repetition levels, definition levels and data bytes, and finally an offset index
   * entry for the page.
   *
   * @param rowCount count of rows
   * @param nullCount count of nulls
   * @param valueCount count of values
   * @param repetitionLevels repetition level bytes
   * @param definitionLevels definition level bytes
   * @param dataEncoding encoding for data
   * @param compressedData compressed data bytes
   * @param uncompressedDataSize the size of uncompressed data
   * @param statistics the statistics of the page
   * @param metadataBlockEncryptor encryptor for block data
   * @param pageHeaderAAD pageHeader AAD
   * @param sizeStatistics size statistics for the page
   * @throws IOException if any I/O error occurs during writing the file
   */
  public void writeDataPageV2(
      int rowCount,
      int nullCount,
      int valueCount,
      BytesInput repetitionLevels,
      BytesInput definitionLevels,
      Encoding dataEncoding,
      BytesInput compressedData,
      int uncompressedDataSize,
      Statistics statistics,
      BlockCipher.Encryptor metadataBlockEncryptor,
      byte[] pageHeaderAAD,
      SizeStatistics sizeStatistics)
      throws IOException {
    state = state.write();

    final long rlBytes = repetitionLevels.size();
    final int rlByteLength = toIntWithCheck(rlBytes, "page repetition levels");
    final long dlBytes = definitionLevels.size();
    final int dlByteLength = toIntWithCheck(dlBytes, "page definition levels");
    final long dataBytes = compressedData.size();

    // Levels are stored as-is in a v2 page, so they count towards both page sizes.
    final long levelBytes = rlBytes + dlBytes;
    final int compressedSize = toIntWithCheck(dataBytes + levelBytes, "page");
    final int uncompressedSize = toIntWithCheck(uncompressedDataSize + levelBytes, "page");

    final long pageStart = out.getPos();
    // The first data page written for a chunk fixes the chunk's first-data-page offset.
    if (currentChunkFirstDataPage < 0) {
      currentChunkFirstDataPage = pageStart;
    }

    if (!pageWriteChecksumEnabled) {
      metadataConverter.writeDataPageV2Header(
          uncompressedSize,
          compressedSize,
          valueCount,
          nullCount,
          rowCount,
          dataEncoding,
          rlByteLength,
          dlByteLength,
          out,
          metadataBlockEncryptor,
          pageHeaderAAD);
    } else {
      // The checksum is fed the three page sections in the order they are written; empty
      // sections are skipped.
      crc.reset();
      if (rlBytes > 0) {
        crcUpdate(repetitionLevels);
      }
      if (dlBytes > 0) {
        crcUpdate(definitionLevels);
      }
      if (dataBytes > 0) {
        crcUpdate(compressedData);
      }
      metadataConverter.writeDataPageV2Header(
          uncompressedSize,
          compressedSize,
          valueCount,
          nullCount,
          rowCount,
          dataEncoding,
          rlByteLength,
          dlByteLength,
          (int) crc.getValue(),
          out,
          metadataBlockEncryptor,
          pageHeaderAAD);
    }

    // Header bytes count towards both the compressed and the uncompressed chunk length.
    final long headerLength = out.getPos() - pageStart;
    uncompressedLength += uncompressedSize + headerLength;
    compressedLength += compressedSize + headerLength;

    mergeColumnStatistics(statistics, sizeStatistics);

    currentEncodings.add(dataEncoding);
    encodingStatsBuilder.addDataEncoding(dataEncoding);

    BytesInput.concat(repetitionLevels, definitionLevels, compressedData).writeAllTo(out);

    // Record the page in the offset index; the unencoded byte-array size is only known when
    // size statistics were supplied for the page.
    final int pageLength = toIntWithCheck(out.getPos() - pageStart, "page");
    if (sizeStatistics == null) {
      offsetIndexBuilder.add(pageLength, rowCount, Optional.empty());
    } else {
      offsetIndexBuilder.add(pageLength, rowCount, sizeStatistics.getUnencodedByteArrayDataBytes());
    }
  }

  /**
   * Feeds the content of {@code bytes} into the running page checksum. The byte buffer is
   * obtained through a releaser from the CRC allocator, and the releaser is closed when the
   * update finishes.
   */
  private void crcUpdate(BytesInput bytes) {
    try (ByteBufferReleaser bufferReleaser = crcAllocator.getReleaser()) {
      crc.update(bytes.toByteBuffer(bufferReleaser));
    }
  }

  /**
   * Writes a column chunk at once, without encrypting the dictionary page header.
   *
   * <p>Delegates to the full overload with a {@code null} header encryptor, row group and column
   * ordinals of {@code 0}, and a {@code null} file AAD.
   *
   * @param descriptor                the descriptor of the column
   * @param valueCount                the value count in this column
   * @param compressionCodecName      the name of the compression codec used for compressing the pages
   * @param dictionaryPage            the dictionary page for this column chunk (might be null)
   * @param bytes                     the encoded pages including page headers to be written as is
   * @param uncompressedTotalPageSize total uncompressed size (without page headers)
   * @param compressedTotalPageSize   total compressed size (without page headers)
   * @param totalStats                accumulated statistics for the column chunk
   * @param totalSizeStats            accumulated size statistics for the column chunk
   * @param columnIndexBuilder        the builder object for the column index
   * @param offsetIndexBuilder        the builder object for the offset index
   * @param bloomFilter               the bloom filter for this column
   * @param rlEncodings               the RL encodings used in this column chunk
   * @param dlEncodings               the DL encodings used in this column chunk
   * @param dataEncodings             the data encodings used in this column chunk
   * @throws IOException if there is an error while writing
   */
  void writeColumnChunk(
      ColumnDescriptor descriptor,
      long valueCount,
      CompressionCodecName compressionCodecName,
      DictionaryPage dictionaryPage,
      BytesInput bytes,
      long uncompressedTotalPageSize,
      long compressedTotalPageSize,
      Statistics totalStats,
      SizeStatistics totalSizeStats,
      ColumnIndexBuilder columnIndexBuilder,
      OffsetIndexBuilder offsetIndexBuilder,
      BloomFilter bloomFilter,
      Set<Encoding> rlEncodings,
      Set<Encoding> dlEncodings,
      List<Encoding> dataEncodings)
      throws IOException {
    writeColumnChunk(
        descriptor,
        valueCount,
        compressionCodecName,
        dictionaryPage,
        bytes,
        uncompressedTotalPageSize,
        compressedTotalPageSize,
        totalStats,
        totalSizeStats,
        columnIndexBuilder,
        offsetIndexBuilder,
        bloomFilter,
        rlEncodings,
        dlEncodings,
        dataEncodings,
        null,
        0,
        0,
        null);
  }

  /**
   * Writes a column chunk at once: starts the column, writes the optional dictionary page, writes
   * the pre-encoded pages as is, installs the supplied statistics and index builders, and ends
   * the column.
   *
   * @param descriptor                the descriptor of the column
   * @param valueCount                the value count in this column
   * @param compressionCodecName      the name of the compression codec used for compressing the pages
   * @param dictionaryPage            the dictionary page for this column chunk (might be null)
   * @param bytes                     the encoded pages including page headers to be written as is
   * @param uncompressedTotalPageSize total uncompressed size (without page headers)
   * @param compressedTotalPageSize   total compressed size (without page headers)
   * @param totalStats                accumulated statistics for the column chunk
   * @param totalSizeStats            accumulated size statistics for the column chunk
   * @param columnIndexBuilder        the builder object for the column index
   * @param offsetIndexBuilder        the builder object for the offset index
   * @param bloomFilter               the bloom filter for this column (might be null)
   * @param rlEncodings               the RL encodings used in this column chunk
   * @param dlEncodings               the DL encodings used in this column chunk
   * @param dataEncodings             the data encodings used in this column chunk
   * @param headerBlockEncryptor      encryptor for the dictionary page header (might be null)
   * @param rowGroupOrdinal           row group ordinal used to build the dictionary page header AAD
   * @param columnOrdinal             column ordinal used to build the dictionary page header AAD
   * @param fileAAD                   file AAD used to build the dictionary page header AAD
   * @throws IOException if there is an error while writing
   */
  void writeColumnChunk(
      ColumnDescriptor descriptor,
      long valueCount,
      CompressionCodecName compressionCodecName,
      DictionaryPage dictionaryPage,
      BytesInput bytes,
      long uncompressedTotalPageSize,
      long compressedTotalPageSize,
      Statistics totalStats,
      SizeStatistics totalSizeStats,
      ColumnIndexBuilder columnIndexBuilder,
      OffsetIndexBuilder offsetIndexBuilder,
      BloomFilter bloomFilter,
      Set<Encoding> rlEncodings,
      Set<Encoding> dlEncodings,
      List<Encoding> dataEncodings,
      BlockCipher.Encryptor headerBlockEncryptor,
      int rowGroupOrdinal,
      int columnOrdinal,
      byte[] fileAAD)
      throws IOException {
    startColumn(descriptor, valueCount, compressionCodecName);

    state = state.write();
    if (dictionaryPage != null) {
      // The AAD is only needed when the dictionary page header is encrypted.
      byte[] dictionaryPageHeaderAAD = null;
      if (null != headerBlockEncryptor) {
        dictionaryPageHeaderAAD = AesCipher.createModuleAAD(
            fileAAD, ModuleType.DictionaryPageHeader, rowGroupOrdinal, columnOrdinal, -1);
      }
      writeDictionaryPage(dictionaryPage, headerBlockEncryptor, dictionaryPageHeaderAAD);
    }

    if (bloomFilter != null) {
      // write bloom filter if one of data pages is not dictionary encoded
      boolean isWriteBloomFilter = false;
      for (Encoding encoding : dataEncodings) {
        // dictionary encoding: `PLAIN_DICTIONARY` is used in parquet v1, `RLE_DICTIONARY` is used in parquet v2
        if (encoding != Encoding.PLAIN_DICTIONARY && encoding != Encoding.RLE_DICTIONARY) {
          isWriteBloomFilter = true;
          break;
        }
      }
      String columnPath = String.join(".", descriptor.getPath());
      if (isWriteBloomFilter) {
        currentBloomFilters.put(columnPath, bloomFilter);
      } else {
        // Log the joined path: passing the String[] directly would be taken as the logger's
        // varargs array, so only the first path element would be printed.
        LOG.info(
            "No need to write bloom filter because column {} data pages are all encoded as dictionary.",
            columnPath);
      }
    }
    LOG.debug("{}: write data pages", out.getPos());
    // `bytes` contains the page headers, whereas the two totals passed in do not.
    long headersSize = bytes.size() - compressedTotalPageSize;
    this.uncompressedLength += uncompressedTotalPageSize + headersSize;
    this.compressedLength += compressedTotalPageSize + headersSize;
    LOG.debug("{}: write data pages content", out.getPos());
    currentChunkFirstDataPage = out.getPos();
    bytes.writeAllTo(out);
    encodingStatsBuilder.addDataEncodings(dataEncodings);
    // An empty RL encoding set is treated as the marker for v2 pages.
    if (rlEncodings.isEmpty()) {
      encodingStatsBuilder.withV2Pages();
    }
    currentEncodings.addAll(rlEncodings);
    currentEncodings.addAll(dlEncodings);
    currentEncodings.addAll(dataEncodings);
    currentStatistics = totalStats;
    currentSizeStatistics = totalSizeStats;

    // The caller-supplied builders replace the writer's own before the column is ended.
    this.columnIndexBuilder = columnIndexBuilder;
    this.offsetIndexBuilder = offsetIndexBuilder;

    endColumn();
  }

  /**
   * Overwrites the column total statistics and disables the column index for the current column.
   * This is meant for cases where the column total statistics are known while the individual page
   * statistics are invalid, for example when rewriting a column.
   *
   * @param totalStatistics the column total statistics; must not be null
   */
  public void invalidateStatistics(Statistics totalStatistics) {
    final boolean hasTotalStatistics = totalStatistics != null;
    Preconditions.checkArgument(hasTotalStatistics, "Column total statistics can not be null");
    this.currentStatistics = totalStatistics;
    // A no-op builder yields no column index for this chunk.
    this.columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
  }

  /**
   * Ends the current column (once all rep, def and data have been written): builds the column
   * and offset indexes, adds the chunk metadata to the current block, and resets the per-chunk
   * counters and builders.
   *
   * @throws IOException if there is an error while writing
   */
  public void endColumn() throws IOException {
    state = state.endColumn();
    LOG.debug("{}: end column", out.getPos());

    // Skip the column index when its min/max values exceed the per-page statistics budget.
    final boolean minMaxTooLarge =
        columnIndexBuilder.getMinMaxSize() > columnIndexBuilder.getPageCount() * MAX_STATS_SIZE;
    currentColumnIndexes.add(minMaxTooLarge ? null : columnIndexBuilder.build());
    currentOffsetIndexes.add(offsetIndexBuilder.build(currentChunkFirstDataPage));

    final ColumnChunkMetaData chunkMetaData = ColumnChunkMetaData.get(
        currentChunkPath,
        currentChunkType,
        currentChunkCodec,
        encodingStatsBuilder.build(),
        currentEncodings,
        currentStatistics,
        currentChunkFirstDataPage,
        currentChunkDictionaryPageOffset,
        currentChunkValueCount,
        compressedLength,
        uncompressedLength,
        currentSizeStatistics);
    currentBlock.addColumn(chunkMetaData);
    currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);

    // Reset the per-chunk state.
    uncompressedLength = 0;
    compressedLength = 0;
    currentChunkDictionaryPageOffset = 0;
    columnIndexBuilder = null;
    offsetIndexBuilder = null;
  }

  /**
   * Ends the current block once all column chunks have been written: finalizes the block
   * metadata, stores the block together with its indexes and Bloom filters, and clears the
   * per-block state.
   *
   * @throws IOException if there is an error while writing
   */
  public void endBlock() throws IOException {
    // This check runs before the state transition, so a rejected call leaves the state untouched.
    final boolean blockIsEmpty = currentRecordCount == 0;
    if (blockIsEmpty) {
      throw new ParquetEncodingException("End block with zero record");
    }

    state = state.endBlock();
    LOG.debug("{}: end block", out.getPos());

    currentBlock.setRowCount(currentRecordCount);
    // The ordinal is the block's position in the list of blocks written so far.
    currentBlock.setOrdinal(blocks.size());

    blocks.add(currentBlock);
    columnIndexes.add(currentColumnIndexes);
    offsetIndexes.add(currentOffsetIndexes);
    bloomFilters.add(currentBloomFilters);

    currentBlock = null;
    currentColumnIndexes = null;
    currentOffsetIndexes = null;
    currentBloomFilters = null;
  }

  /**
   * Opens the given file with a {@link ParquetFileReader} and appends its contents to this file.
   * The reader is closed when the append finishes or fails.
   *
   * @param conf a configuration
   * @param file a file path to append the contents of to this file
   * @throws IOException if there is an error while reading or writing
   * @deprecated will be removed in 2.0.0; use {@link #appendFile(InputFile)} instead
   */
  @Deprecated
  public void appendFile(Configuration conf, Path file) throws IOException {
    try (ParquetFileReader source = ParquetFileReader.open(conf, file)) {
      source.appendTo(this);
    }
  }

  /**
   * Opens the given file with a {@link ParquetFileReader} and appends its contents to this file.
   * The reader is closed when the append finishes or fails.
   *
   * @param file the input file whose contents are appended to this file
   * @throws IOException if there is an error while reading or writing
   */
  public void appendFile(InputFile file) throws IOException {
    try (ParquetFileReader source = ParquetFileReader.open(file)) {
      source.appendTo(this);
    }
  }

  /**
   * Appends the given row groups, read from a Hadoop stream, to this file. The stream is wrapped
   * with {@link HadoopStreams#wrap} and passed to the {@link SeekableInputStream} overload.
   *
   * @param file        a file stream to read from
   * @param rowGroups   row groups to copy
   * @param dropColumns whether to drop columns from the file that are not in this file's schema
   * @throws IOException if there is an error while reading or writing
   * @deprecated will be removed in 2.0.0;
   * use {@link #appendRowGroups(SeekableInputStream, List, boolean)} instead
   */
  @Deprecated
  public void appendRowGroups(FSDataInputStream file, List<BlockMetaData> rowGroups, boolean dropColumns)
      throws IOException {
    appendRowGroups(HadoopStreams.wrap(file), rowGroups, dropColumns);
  }

  /**
   * Appends the given row groups to this file, one at a time and in list order.
   *
   * @param file        a file stream to read from
   * @param rowGroups   row groups to copy
   * @param dropColumns whether to drop columns from the file that are not in this file's schema
   * @throws IOException if there is an error while reading or writing
   */
  public void appendRowGroups(SeekableInputStream file, List<BlockMetaData> rowGroups, boolean dropColumns)
      throws IOException {
    for (BlockMetaData block : rowGroups) {
      appendRowGroup(file, block, dropColumns);
    }
  }

  /**
   * Appends a single row group, read from a Hadoop stream, to this file. The stream is wrapped
   * with {@link HadoopStreams#wrap} and passed to the {@link SeekableInputStream} overload.
   *
   * @param from        a file stream to read from
   * @param rowGroup    row group to copy
   * @param dropColumns whether to drop columns from the file that are not in this file's schema
   * @throws IOException if there is an error while reading or writing
   * @deprecated will be removed in 2.0.0;
   * use {@link #appendRowGroup(SeekableInputStream, BlockMetaData, boolean)} instead
   */
  @Deprecated
  public void appendRowGroup(FSDataInputStream from, BlockMetaData rowGroup, boolean dropColumns) throws IOException {
    final SeekableInputStream wrapped = HadoopStreams.wrap(from);
    appendRowGroup(wrapped, rowGroup, dropColumns);
  }

  /**
   * Appends a single row group to this file by copying the raw bytes of its column chunks, in
   * the column order of this file's schema. Adjacent chunks that are contiguous in the source
   * are copied with a single call. Column and offset indexes are not copied.
   *
   * @param from        a file stream to read from
   * @param rowGroup    row group to copy
   * @param dropColumns whether to drop columns from the file that are not in this file's schema
   * @throws IOException if there is an error while reading or writing
   * @throws IllegalArgumentException if a column of this file's schema is missing from the row
   *     group, or if the row group has extra columns and {@code dropColumns} is false
   */
  public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, boolean dropColumns)
      throws IOException {
    startBlock(rowGroup.getRowCount());

    // Index the source chunks by dotted column path.
    Map<String, ColumnChunkMetaData> columnsToCopy = new HashMap<>();
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
      columnsToCopy.put(chunk.getPath().toDotString(), chunk);
    }

    List<ColumnChunkMetaData> columnsInOrder = new ArrayList<>();

    for (ColumnDescriptor descriptor : schema.getColumns()) {
      String path = ColumnPath.get(descriptor.getPath()).toDotString();
      ColumnChunkMetaData chunk = columnsToCopy.remove(path);
      if (chunk != null) {
        columnsInOrder.add(chunk);
      } else {
        throw new IllegalArgumentException(
            String.format("Missing column '%s', cannot copy row group: %s", path, rowGroup));
      }
    }

    // complain if some columns would be dropped and that's not okay
    if (!dropColumns && !columnsToCopy.isEmpty()) {
      throw new IllegalArgumentException(String.format(
          "Columns cannot be copied (missing from target schema): %s",
          String.join(", ", columnsToCopy.keySet())));
    }

    // copy the data for all chunks
    long start = -1;
    long length = 0;
    long blockUncompressedSize = 0L;
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);

      // get this chunk's start position in the new file; `length` accounts for pending chunks
      // that are queued for copying but not yet written
      long newChunkStart = out.getPos() + length;

      // add this chunk to be copied with any previous chunks
      if (start < 0) {
        // no previous chunk included, start at this chunk's starting pos
        start = chunk.getStartingPos();
      }
      length += chunk.getTotalSize();

      if ((i + 1) == columnsInOrder.size() || columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
        // not contiguous. do the copy now.
        copy(from, out, start, length);
        // reset to start at the next column chunk
        start = -1;
        length = 0;
      }

      // TODO: column/offset indexes are not copied
      // (it would require seeking to the end of the file for each row groups)
      currentColumnIndexes.add(null);
      currentOffsetIndexes.add(null);

      Offsets offsets = Offsets.getOffsets(from, chunk, newChunkStart);
      currentBlock.addColumn(ColumnChunkMetaData.get(
          chunk.getPath(),
          chunk.getPrimitiveType(),
          chunk.getCodec(),
          chunk.getEncodingStats(),
          chunk.getEncodings(),
          chunk.getStatistics(),
          offsets.firstDataPageOffset,
          offsets.dictionaryPageOffset,
          chunk.getValueCount(),
          chunk.getTotalSize(),
          chunk.getTotalUncompressedSize()));

      blockUncompressedSize += chunk.getTotalUncompressedSize();
    }

    currentBlock.setTotalByteSize(blockUncompressedSize);

    endBlock();
  }

  /**
   * Appends a single column chunk to the current block by copying its raw bytes, and registers
   * the supplied Bloom filter, column index and offset index for it. When the chunk lands at a
   * different position than in the source, the offset index is rebuilt with the position shift.
   *
   * @param descriptor  the descriptor for the target column
   * @param from        a file stream to read from
   * @param chunk       the column chunk to be copied
   * @param bloomFilter the bloomFilter for this chunk
   * @param columnIndex the column index for this chunk
   * @param offsetIndex the offset index for this chunk
   * @throws IOException if there is an error while reading or writing
   */
  public void appendColumnChunk(
      ColumnDescriptor descriptor,
      SeekableInputStream from,
      ColumnChunkMetaData chunk,
      BloomFilter bloomFilter,
      ColumnIndex columnIndex,
      OffsetIndex offsetIndex)
      throws IOException {
    final long sourceStart = chunk.getStartingPos();
    final long chunkLength = chunk.getTotalSize();
    final long targetStart = out.getPos();

    // Shift the offset index only when the chunk actually moves.
    OffsetIndex relocatedOffsetIndex = offsetIndex;
    if (relocatedOffsetIndex != null && targetStart != sourceStart) {
      final long shift = targetStart - sourceStart;
      relocatedOffsetIndex = OffsetIndexBuilder.getBuilder()
          .fromOffsetIndex(relocatedOffsetIndex)
          .build(shift);
    }

    copy(from, out, sourceStart, chunkLength);

    final String columnPath = String.join(".", descriptor.getPath());
    currentBloomFilters.put(columnPath, bloomFilter);
    currentColumnIndexes.add(columnIndex);
    currentOffsetIndexes.add(relocatedOffsetIndex);

    final Offsets offsets = Offsets.getOffsets(from, chunk, targetStart);
    final ColumnChunkMetaData copiedChunk = ColumnChunkMetaData.get(
        chunk.getPath(),
        chunk.getPrimitiveType(),
        chunk.getCodec(),
        chunk.getEncodingStats(),
        chunk.getEncodings(),
        chunk.getStatistics(),
        offsets.firstDataPageOffset,
        offsets.dictionaryPageOffset,
        chunk.getValueCount(),
        chunk.getTotalSize(),
        chunk.getTotalUncompressedSize());
    currentBlock.addColumn(copiedChunk);

    currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + chunk.getTotalUncompressedSize());
  }

  // Per-thread scratch buffer for the copy function, so each call does not allocate a new array.
  private static final ThreadLocal<byte[]> COPY_BUFFER = ThreadLocal.withInitial(() -> new byte[8192]);

  /**
   * Copies {@code length} bytes, starting at {@code start} in the input stream, to the output
   * stream. The scratch buffer is thread-local, so concurrent calls do not share it.
   *
   * @param from   a {@link SeekableInputStream}
   * @param to     any {@link PositionOutputStream}
   * @param start  where in the from stream to start copying
   * @param length the number of bytes to copy
   * @throws IOException if there is an error while reading or writing
   * @throws IllegalArgumentException if the input ends before {@code length} bytes were read
   */
  private static void copy(SeekableInputStream from, PositionOutputStream to, long start, long length)
      throws IOException {
    LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
    from.seek(start);
    long bytesCopied = 0;
    byte[] buffer = COPY_BUFFER.get();
    while (bytesCopied < length) {
      // Clamp in long arithmetic before narrowing: the remaining length may exceed
      // Integer.MAX_VALUE, while the result never exceeds the buffer length.
      int bytesToRead = (int) Math.min(buffer.length, length - bytesCopied);
      int bytesRead = from.read(buffer, 0, bytesToRead);
      if (bytesRead < 0) {
        // Parenthesized so the position is added numerically rather than concatenated as text.
        throw new IllegalArgumentException("Unexpected end of input file at " + (start + bytesCopied));
      }
      to.write(buffer, 0, bytesRead);
      bytesCopied += bytesRead;
    }
  }

  /**
   * Ends the file once all blocks have been written: serializes the column indexes, offset
   * indexes and Bloom filters, then the footer, and closes the file. The file is closed even if
   * writing fails.
   *
   * @param extraMetaData the extra meta data to write in the footer
   * @throws IOException if there is an error while writing
   */
  public void end(Map extraMetaData) throws IOException {
    try {
      state = state.end();

      serializeColumnIndexes(columnIndexes, blocks, out, fileEncryptor);
      serializeOffsetIndexes(offsetIndexes, blocks, out, fileEncryptor);
      serializeBloomFilters(bloomFilters, blocks, out, fileEncryptor);

      LOG.debug("{}: end", out.getPos());
      final FileMetaData fileMetaData = new FileMetaData(schema, extraMetaData, Version.FULL_VERSION);
      this.footer = new ParquetMetadata(fileMetaData, blocks);
      serializeFooter(this.footer, out, fileEncryptor, metadataConverter);
    } finally {
      close();
    }
  }

  /**
   * Flushes and closes the output stream and releases the CRC allocator, if one exists. Calling
   * this method again after the first call is a no-op.
   *
   * @throws IOException if flushing or closing the output stream fails
   */
  @Override
  public void close() throws IOException {
    if (closed) {
      return;
    }
    try (PositionOutputStream temp = out) {
      try {
        temp.flush();
      } finally {
        // Release the allocator even when the flush fails; otherwise it would leak because
        // `closed` is set below and a later close() call returns early.
        if (crcAllocator != null) {
          crcAllocator.close();
        }
      }
    } finally {
      closed = true;
    }
  }

  private static void serializeColumnIndexes(
      List> columnIndexes,
      List blocks,
      PositionOutputStream out,
      InternalFileEncryptor fileEncryptor)
      throws IOException {
    LOG.debug("{}: column indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
      BlockMetaData block = blocks.get(bIndex);
      List columns = block.getColumns();
      List blockColumnIndexes = columnIndexes.get(bIndex);
      for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
        ColumnChunkMetaData column = columns.get(cIndex);
        org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter.toParquetColumnIndex(
            column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
        if (columnIndex == null) {
          continue;
        }
        BlockCipher.Encryptor columnIndexEncryptor = null;
        byte[] columnIndexAAD = null;
        if (null != fileEncryptor) {
          InternalColumnEncryptionSetup columnEncryptionSetup =
              fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
          if (columnEncryptionSetup.isEncrypted()) {
            columnIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
            columnIndexAAD = AesCipher.createModuleAAD(
                fileEncryptor.getFileAAD(),
                ModuleType.ColumnIndex,
                block.getOrdinal(),
                columnEncryptionSetup.getOrdinal(),
                -1);
          }
        }
        long offset = out.getPos();
        Util.writeColumnIndex(columnIndex, out, columnIndexEncryptor, columnIndexAAD);
        column.setColumnIndexReference(
            new IndexReference(offset, toIntWithCheck(out.getPos() - offset, "page")));
      }
    }
  }

  /**
   * Narrows {@code size} to an {@code int}, failing if the value does not fit.
   *
   * @param size the size to narrow
   * @param obj  a label for the thing being sized, used in the error message
   * @return {@code size} as an {@code int}
   * @throws ParquetSizeOverflowException if {@code size} is outside the {@code int} range
   */
  private static int toIntWithCheck(long size, String obj) {
    final boolean fitsInInt = size >= Integer.MIN_VALUE && size <= Integer.MAX_VALUE;
    if (fitsInInt) {
      return (int) size;
    }
    throw new ParquetSizeOverflowException(
        String.format("Cannot write %s larger than %s bytes: %s", obj, Integer.MAX_VALUE, size));
  }

  /**
   * Folds one page's statistics into the column chunk's aggregates and feeds the column index
   * builder. Once a page arrives with null or empty statistics, the chunk statistics are replaced
   * by an empty instance and the column index builder by a no-op one; later pages then only
   * update the size statistics.
   *
   * @param statistics     the page statistics (may be null or empty)
   * @param sizeStatistics the page size statistics (may be null)
   */
  private void mergeColumnStatistics(Statistics statistics, SizeStatistics sizeStatistics) {
    Preconditions.checkState(currentSizeStatistics != null, "Aggregate size statistics should not be null");
    currentSizeStatistics.mergeStatistics(sizeStatistics);
    // When the aggregate became invalid, hand null to the ColumnIndexBuilder to clear its state.
    final SizeStatistics pageSizeStatistics = currentSizeStatistics.isValid() ? sizeStatistics : null;

    // An empty aggregate marks a chunk that already saw an invalid page: nothing more to merge.
    final boolean alreadyInvalidated = currentStatistics != null && currentStatistics.isEmpty();
    if (alreadyInvalidated) {
      return;
    }

    final boolean pageStatisticsMissing = statistics == null || statistics.isEmpty();
    if (pageStatisticsMissing) {
      // The column index and statistics should be invalid if some page statistics are null or empty.
      // See PARQUET-2365 for more details
      currentStatistics =
          Statistics.getBuilderForReading(currentChunkType).build();
      columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
      return;
    }

    if (currentStatistics == null) {
      // First page of the chunk: start from a copy so the aggregate has the correct type.
      currentStatistics = statistics.copy();
    } else {
      currentStatistics.mergeStatistics(statistics);
    }
    columnIndexBuilder.add(statistics, pageSizeStatistics);
  }

  private static void serializeOffsetIndexes(
      List> offsetIndexes,
      List blocks,
      PositionOutputStream out,
      InternalFileEncryptor fileEncryptor)
      throws IOException {
    LOG.debug("{}: offset indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
      BlockMetaData block = blocks.get(bIndex);
      List columns = block.getColumns();
      List blockOffsetIndexes = offsetIndexes.get(bIndex);
      for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
        OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
        if (offsetIndex == null) {
          continue;
        }
        ColumnChunkMetaData column = columns.get(cIndex);
        BlockCipher.Encryptor offsetIndexEncryptor = null;
        byte[] offsetIndexAAD = null;
        if (null != fileEncryptor) {
          InternalColumnEncryptionSetup columnEncryptionSetup =
              fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
          if (columnEncryptionSetup.isEncrypted()) {
            offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
            offsetIndexAAD = AesCipher.createModuleAAD(
                fileEncryptor.getFileAAD(),
                ModuleType.OffsetIndex,
                block.getOrdinal(),
                columnEncryptionSetup.getOrdinal(),
                -1);
          }
        }
        long offset = out.getPos();
        Util.writeOffsetIndex(
            ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex),
            out,
            offsetIndexEncryptor,
            offsetIndexAAD);
        column.setOffsetIndexReference(
            new IndexReference(offset, toIntWithCheck(out.getPos() - offset, "page")));
      }
    }
  }

  private static void serializeBloomFilters(
      List> bloomFilters,
      List blocks,
      PositionOutputStream out,
      InternalFileEncryptor fileEncryptor)
      throws IOException {
    LOG.debug("{}: bloom filters", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
      BlockMetaData block = blocks.get(bIndex);
      List columns = block.getColumns();
      Map blockBloomFilters = bloomFilters.get(bIndex);
      if (blockBloomFilters.isEmpty()) continue;
      for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
        ColumnChunkMetaData column = columns.get(cIndex);
        BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
        if (bloomFilter == null) {
          continue;
        }

        long offset = out.getPos();
        column.setBloomFilterOffset(offset);

        BlockCipher.Encryptor bloomFilterEncryptor = null;
        byte[] bloomFilterHeaderAAD = null;
        byte[] bloomFilterBitsetAAD = null;
        if (null != fileEncryptor) {
          InternalColumnEncryptionSetup columnEncryptionSetup =
              fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
          if (columnEncryptionSetup.isEncrypted()) {
            bloomFilterEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
            int columnOrdinal = columnEncryptionSetup.getOrdinal();
            bloomFilterHeaderAAD = AesCipher.createModuleAAD(
                fileEncryptor.getFileAAD(),
                ModuleType.BloomFilterHeader,
                block.getOrdinal(),
                columnOrdinal,
                -1);
            bloomFilterBitsetAAD = AesCipher.createModuleAAD(
                fileEncryptor.getFileAAD(),
                ModuleType.BloomFilterBitset,
                block.getOrdinal(),
                columnOrdinal,
                -1);
          }
        }

        Util.writeBloomFilterHeader(
            ParquetMetadataConverter.toBloomFilterHeader(bloomFilter),
            out,
            bloomFilterEncryptor,
            bloomFilterHeaderAAD);

        ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
        bloomFilter.writeTo(tempOutStream);
        byte[] serializedBitset = tempOutStream.toByteArray();
        if (null != bloomFilterEncryptor) {
          serializedBitset = bloomFilterEncryptor.encrypt(serializedBitset, bloomFilterBitsetAAD);
        }
        out.write(serializedBitset);

        int length = Math.toIntExact(out.getPos() - offset);
        column.setBloomFilterLength(length);
      }
    }
  }

  /**
   * Writes the file footer, its length and the trailing magic bytes to {@code out}.
   *
   * <p>Three layouts are produced, depending on {@code fileEncryptor}:
   * <ul>
   *   <li>{@code null} encryptor: plaintext footer, little-endian length, {@code MAGIC};</li>
   *   <li>encryptor with a plaintext footer: plaintext footer followed by a signature made of
   *       the nonce and GCM tag of the encrypted footer, little-endian length of both,
   *       {@code MAGIC};</li>
   *   <li>encryptor with an encrypted footer: file crypto metadata followed by the encrypted
   *       footer, little-endian length of both, {@code EFMAGIC}.</li>
   * </ul>
   *
   * @param footer            the metadata to serialize
   * @param out               the stream to write to
   * @param fileEncryptor     the file encryptor, or {@code null} for an unencrypted file
   * @param metadataConverter converter used to build the Thrift file metadata
   * @throws IOException if querying the position of, or writing to, {@code out} fails
   */
  private static void serializeFooter(
      ParquetMetadata footer,
      PositionOutputStream out,
      InternalFileEncryptor fileEncryptor,
      ParquetMetadataConverter metadataConverter)
      throws IOException {

    // Unencrypted file
    if (null == fileEncryptor) {
      long footerIndex = out.getPos();
      org.apache.parquet.format.FileMetaData parquetMetadata =
          metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
      writeFileMetaData(parquetMetadata, out);
      LOG.debug("{}: footer length = {}", out.getPos(), (out.getPos() - footerIndex));
      BytesUtils.writeIntLittleEndian(out, toIntWithCheck(out.getPos() - footerIndex, "footer"));
      out.write(MAGIC);
      return;
    }

    org.apache.parquet.format.FileMetaData parquetMetadata =
        metadataConverter.toParquetMetadata(CURRENT_VERSION, footer, fileEncryptor);

    // Encrypted file with plaintext footer
    if (!fileEncryptor.isFooterEncrypted()) {
      long footerIndex = out.getPos();
      parquetMetadata.setEncryption_algorithm(fileEncryptor.getEncryptionAlgorithm());
      // create footer signature (nonce + tag of encrypted footer)
      byte[] footerSigningKeyMetaData = fileEncryptor.getFooterSigningKeyMetaData();
      if (null != footerSigningKeyMetaData) {
        parquetMetadata.setFooter_signing_key_metadata(footerSigningKeyMetaData);
      }
      ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
      writeFileMetaData(parquetMetadata, tempOutStream);
      byte[] serializedFooter = tempOutStream.toByteArray();
      byte[] footerAAD = AesCipher.createFooterAAD(fileEncryptor.getFileAAD());
      // The ciphertext itself is discarded: it is produced only to harvest the nonce and tag.
      byte[] encryptedFooter = fileEncryptor.getSignedFooterEncryptor().encrypt(serializedFooter, footerAAD);
      byte[] signature = new byte[AesCipher.NONCE_LENGTH + AesCipher.GCM_TAG_LENGTH];
      System.arraycopy(
          encryptedFooter,
          ModuleCipherFactory.SIZE_LENGTH,
          signature,
          0,
          AesCipher.NONCE_LENGTH); // copy Nonce
      System.arraycopy(
          encryptedFooter,
          encryptedFooter.length - AesCipher.GCM_TAG_LENGTH,
          signature,
          AesCipher.NONCE_LENGTH,
          AesCipher.GCM_TAG_LENGTH); // copy GCM Tag
      out.write(serializedFooter);
      out.write(signature);
      LOG.debug("{}: footer and signature length = {}", out.getPos(), (out.getPos() - footerIndex));
      // Same label as the unencrypted branch: what is being measured here is the footer.
      BytesUtils.writeIntLittleEndian(out, toIntWithCheck(out.getPos() - footerIndex, "footer"));
      out.write(MAGIC);
      return;
    }

    // Encrypted file with encrypted footer
    long cryptoFooterIndex = out.getPos();
    writeFileCryptoMetaData(fileEncryptor.getFileCryptoMetaData(), out);
    byte[] footerAAD = AesCipher.createFooterAAD(fileEncryptor.getFileAAD());
    writeFileMetaData(parquetMetadata, out, fileEncryptor.getFooterEncryptor(), footerAAD);
    // The recorded length covers the crypto metadata and the encrypted footer together.
    int combinedMetaDataLength = toIntWithCheck(out.getPos() - cryptoFooterIndex, "footer");
    LOG.debug("{}: crypto metadata and footer length = {}", out.getPos(), combinedMetaDataLength);
    BytesUtils.writeIntLittleEndian(out, combinedMetaDataLength);
    out.write(EFMAGIC);
  }

  /**
   * Returns the footer held by this writer once writing has ended.
   *
   * <p>The call is guarded by a state check: the writer's state must be {@code STATE.ENDED}.
   * NOTE(review): {@code Preconditions.checkState} is expected to throw when the check fails;
   * confirm the exception type against its definition.
   *
   * @return the writer's {@code footer}
   */
  public ParquetMetadata getFooter() {
    final boolean writingEnded = (STATE.ENDED == state);
    Preconditions.checkState(writingEnded, "Cannot return unfinished footer.");
    return footer;
  }

  /**
   * Given a list of metadata files, merge them into a single ParquetMetadata.
   * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
   *
   * <p>Delegates to {@link #mergeMetadataFiles(List, Configuration, KeyValueMetadataMergeStrategy)}
   * with a {@link StrictKeyValueMetadataMergeStrategy}.
   *
   * @param files a list of files to merge metadata from
   * @param conf  a configuration
   * @return merged parquet metadata for the files
   * @throws IOException if there is an error while reading the footers
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
    return mergeMetadataFiles(files, conf, new StrictKeyValueMetadataMergeStrategy());
  }

  /**
   * Given a list of metadata files, merge them into a single ParquetMetadata.
   * Requires that the schemas be compatible (they are merged in strict mode); how multiple
   * values for the same extraMetadata key are reconciled is decided by
   * {@code keyValueMetadataMergeStrategy}.
   *
   * <p>The footer of every file is read without a filter, its file metadata is folded into a
   * running {@link GlobalMetaData}, and its blocks are appended to the result in file order.
   * An empty {@code files} list is rejected through {@code Preconditions.checkArgument}.
   *
   * @param files                         a list of files to merge metadata from; must not be empty
   * @param conf                          a configuration
   * @param keyValueMetadataMergeStrategy strategy to merge values for same key, if there are multiple
   * @return merged parquet metadata for the files
   * @throws IOException if there is an error while reading the footers
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static ParquetMetadata mergeMetadataFiles(
      List<Path> files, Configuration conf, KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy)
      throws IOException {
    Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

    GlobalMetaData globalMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<>();

    for (Path p : files) {
      ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
      FileMetaData fmd = pmd.getFileMetaData();
      globalMetaData = mergeInto(fmd, globalMetaData, true);
      blocks.addAll(pmd.getBlocks());
    }

    // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
    return new ParquetMetadata(globalMetaData.merge(keyValueMetadataMergeStrategy), blocks);
  }

  /**
   * Given a list of metadata files, merge them into a single metadata file.
   * Requires that the schemas be compatible, and the extraMetaData be exactly equal.
   * This is useful when merging 2 directories of parquet files into a single directory, as long
   * as both directories were written with compatible schemas and equal extraMetaData.
   *
   * <p>The merged metadata is written to {@code outputPath} on the file system that
   * {@code outputPath} resolves to under {@code conf}.
   *
   * @param files      a list of files to merge metadata from
   * @param outputPath path to write merged metadata to
   * @param conf       a configuration
   * @throws IOException if there is an error while reading or writing
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static void writeMergedMetadataFile(List<Path> files, Path outputPath, Configuration conf)
      throws IOException {
    ParquetMetadata merged = mergeMetadataFiles(files, conf);
    writeMetadataFile(outputPath, merged, outputPath.getFileSystem(conf));
  }

  /**
   * writes a _metadata and _common_metadata file
   *
   * @param configuration the configuration to use to get the FileSystem
   * @param outputPath    the directory to write the _metadata file to
   * @param footers       the list of footers to merge
   * @throws IOException if there is an error while writing
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers)
      throws IOException {
    writeMetadataFile(configuration, outputPath, footers, JobSummaryLevel.ALL);
  }

  /**
   * writes _common_metadata file, and optionally a _metadata file depending on the {@link JobSummaryLevel} provided
   *
   * @param configuration the configuration to use to get the FileSystem
   * @param outputPath    the directory to write the _metadata file to
   * @param footers       the list of footers to merge
   * @param level         level of summary to write; only {@code ALL} and {@code COMMON_ONLY} are accepted
   * @throws IOException if there is an error while writing
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static void writeMetadataFile(
      Configuration configuration, Path outputPath, List<Footer> footers, JobSummaryLevel level)
      throws IOException {
    Preconditions.checkArgument(
        level == JobSummaryLevel.ALL || level == JobSummaryLevel.COMMON_ONLY, "Unsupported level: %s", level);

    FileSystem fs = outputPath.getFileSystem(configuration);
    outputPath = outputPath.makeQualified(fs);
    ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);

    if (level == JobSummaryLevel.ALL) {
      writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
    }

    // The common metadata file is the same footer with its block list emptied.
    metadataFooter.getBlocks().clear();
    writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
  }

  /**
   * Writes {@code metadataFooter} to the file named {@code parquetMetadataFile} under
   * {@code outputPathRoot}.
   *
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  private static void writeMetadataFile(
      Path outputPathRoot, ParquetMetadata metadataFooter, FileSystem fs, String parquetMetadataFile)
      throws IOException {
    Path metaDataPath = new Path(outputPathRoot, parquetMetadataFile);
    writeMetadataFile(metaDataPath, metadataFooter, fs);
  }

  /**
   * Creates {@code outputPath} and writes {@code MAGIC} followed by the unencrypted footer.
   *
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
      throws IOException {
    try (PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath))) {
      metadata.write(MAGIC);
      serializeFooter(metadataFooter, metadata, null, new ParquetMetadataConverter());
      metadata.flush();
    }
  }

  /**
   * Will merge the metadata of all the footers together
   *
   * @param root    the directory containing all footers
   * @param footers the list files footers to merge
   * @return the global meta data for all the footers
   */
  static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    return mergeFooters(root, footers, new StrictKeyValueMetadataMergeStrategy());
  }

  /**
   * Will merge the metadata of all the footers together
   *
   * @param root                  the directory containing all footers
   * @param footers               the list files footers to merge
   * @param keyValueMergeStrategy strategy to merge values for a given key (if there are multiple values)
   * @return the global meta data for all the footers
   * @throws ParquetEncodingException if a footer's file path does not start with the root path
   */
  static ParquetMetadata mergeFooters(
      Path root, List<Footer> footers, KeyValueMetadataMergeStrategy keyValueMergeStrategy) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<>();
    for (Footer footer : footers) {
      String footerPath = footer.getFile().toUri().getPath();
      if (!footerPath.startsWith(rootPath)) {
        throw new ParquetEncodingException(
            footerPath + " invalid: all the files must be contained in the root " + root);
      }
      // Make the path relative to the root and drop any leading separators.
      footerPath = footerPath.substring(rootPath.length());
      while (footerPath.startsWith("/")) {
        footerPath = footerPath.substring(1);
      }
      fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
      for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
        block.setPath(footerPath);
        blocks.add(block);
      }
    }
    return new ParquetMetadata(fileMetaData.merge(keyValueMergeStrategy), blocks);
  }

  /**
   * @return the current position in the underlying file
   * @throws IOException if there is an error while getting the current stream's position
   */
  public long getPos() throws IOException {
    return out.getPos();
  }

  /**
   * @return the size the alignment strategy proposes for the next row group
   * @throws IOException if there is an error while getting the current stream's position
   */
  public long getNextRowGroupSize() throws IOException {
    return alignment.nextRowGroupSize(out);
  }

  /**
   * Will merge the metadata of all the footers together
   *
   * @param footers the list files footers to merge
   * @return the global meta data for all the footers
   */
  static GlobalMetaData getGlobalMetaData(List<Footer> footers) {
    return getGlobalMetaData(footers, true);
  }

  /**
   * Will merge the metadata of all the footers together
   *
   * @param footers the list files footers to merge
   * @param strict  passed through to the schema merge
   * @return the global meta data for all the footers, or {@code null} if {@code footers} is empty
   */
  static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
    GlobalMetaData fileMetaData = null;
    for (Footer footer : footers) {
      ParquetMetadata currentMetadata = footer.getParquetMetadata();
      fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
    }
    return fileMetaData;
  }

  /**
   * Will return the result of merging toMerge into mergedMetadata
   *
   * @param toMerge        the metadata toMerge
   * @param mergedMetadata the reference metadata to merge into
   * @return the result of the merge
   */
  static GlobalMetaData mergeInto(FileMetaData toMerge, GlobalMetaData mergedMetadata) {
    return mergeInto(toMerge, mergedMetadata, true);
  }

  /**
   * Will return the result of merging toMerge into mergedMetadata
   *
   * @param toMerge        the metadata toMerge
   * @param mergedMetadata the reference metadata to merge into, or {@code null} to start fresh
   * @param strict         passed through to the schema merge
   * @return a new {@link GlobalMetaData} holding the merged schema, key/value sets and creators
   */
  static GlobalMetaData mergeInto(FileMetaData toMerge, GlobalMetaData mergedMetadata, boolean strict) {
    MessageType schema = null;
    Map<String, Set<String>> newKeyValues = new HashMap<>();
    Set<String> createdBy = new HashSet<>();
    if (mergedMetadata != null) {
      schema = mergedMetadata.getSchema();
      newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
      createdBy.addAll(mergedMetadata.getCreatedBy());
    }
    // Only merge schemas when there is something new: no schema yet, or a different one.
    if ((schema == null && toMerge.getSchema() != null)
        || (schema != null && !schema.equals(toMerge.getSchema()))) {
      schema = mergeInto(toMerge.getSchema(), schema, strict);
    }
    for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
      // LinkedHashSet keeps the distinct values of a key in the order they were first seen.
      Set<String> values = newKeyValues.computeIfAbsent(entry.getKey(), k -> new LinkedHashSet<>());
      values.add(entry.getValue());
    }
    createdBy.add(toMerge.getCreatedBy());
    return new GlobalMetaData(schema, newKeyValues, createdBy);
  }

  /**
   * will return the result of merging toMerge into mergedSchema
   *
   * @param toMerge      the schema to merge into mergedSchema
   * @param mergedSchema the schema to append the fields to
   * @return the resulting schema
   */
  static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema) {
    return mergeInto(toMerge, mergedSchema, true);
  }

  /**
   * will return the result of merging toMerge into mergedSchema
   *
   * @param toMerge      the schema to merge into mergedSchema
   * @param mergedSchema the schema to append the fields to
   * @param strict       should schema primitive types match
   * @return the resulting schema
   */
  static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema, boolean strict) {
    if (mergedSchema == null) {
      return toMerge;
    }

    return mergedSchema.union(toMerge, strict);
  }

  /** Strategy for positioning row groups in the output stream and sizing the next one. */
  private interface AlignmentStrategy {
    void alignForRowGroup(PositionOutputStream out) throws IOException;

    long nextRowGroupSize(PositionOutputStream out) throws IOException;
  }

  /** Alignment strategy that never pads and always proposes the configured row group size. */
  private static class NoAlignment implements AlignmentStrategy {
    public static NoAlignment get(long rowGroupSize) {
      return new NoAlignment(rowGroupSize);
    }

    private final long rowGroupSize;

    private NoAlignment(long rowGroupSize) {
      this.rowGroupSize = rowGroupSize;
    }

    @Override
    public void alignForRowGroup(PositionOutputStream out) {}

    @Override
    public long nextRowGroupSize(PositionOutputStream out) {
      return rowGroupSize;
    }
  }

  /**
   * Alignment strategy that pads with zeros up to the next DFS block boundary when the space
   * left in the current DFS block is at most {@code maxPaddingSize} bytes.
   */
  private static class PaddingAlignment implements AlignmentStrategy {
    private static final byte[] zeros = new byte[4096];

    public static PaddingAlignment get(long dfsBlockSize, long rowGroupSize, int maxPaddingSize) {
      return new PaddingAlignment(dfsBlockSize, rowGroupSize, maxPaddingSize);
    }

    protected final long dfsBlockSize;
    protected final long rowGroupSize;
    protected final int maxPaddingSize;

    private PaddingAlignment(long dfsBlockSize, long rowGroupSize, int maxPaddingSize) {
      this.dfsBlockSize = dfsBlockSize;
      this.rowGroupSize = rowGroupSize;
      this.maxPaddingSize = maxPaddingSize;
    }

    @Override
    public void alignForRowGroup(PositionOutputStream out) throws IOException {
      long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);

      if (isPaddingNeeded(remaining)) {
        LOG.debug(
            "Adding {} bytes of padding (row group size={}B, block size={}B)",
            remaining,
            rowGroupSize,
            dfsBlockSize);
        // Write the padding in chunks of at most zeros.length bytes.
        for (; remaining > 0; remaining -= zeros.length) {
          out.write(zeros, 0, (int) Math.min((long) zeros.length, remaining));
        }
      }
    }

    @Override
    public long nextRowGroupSize(PositionOutputStream out) throws IOException {
      if (maxPaddingSize <= 0) {
        return rowGroupSize;
      }

      long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);

      if (isPaddingNeeded(remaining)) {
        // The row group will start at a fresh DFS block, so the full size is available.
        return rowGroupSize;
      }

      // Otherwise cap the row group so that it fits in what is left of the current DFS block.
      return Math.min(remaining, rowGroupSize);
    }

    protected boolean isPaddingNeeded(long remaining) {
      return (remaining <= maxPaddingSize);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy