org.apache.parquet.hadoop.rewrite.ParquetRewriter

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop.rewrite;

import static org.apache.parquet.column.ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
import static org.apache.parquet.column.ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH;
import static org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.ColumnWriteStore;
import org.apache.parquet.column.ColumnWriter;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.ColumnChunkPageWriteStore;
import org.apache.parquet.hadoop.IndexCache;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopCodecs;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.InvalidSchemaException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Rewrites multiple input files into a single output file.
 * <p>
 * Supported functionality:
 * <ul>
 *   <li>merging multiple files into a single one,</li>
 *   <li>applying column transformations,</li>
 *   <li>joining with extra files with a different schema.</li>
 * </ul>
 * <p>
 * Note that the total number of row groups from all input files is preserved in the output file.
 * This may not be optimal if row groups are very small, and it does not solve the small-file problem;
 * on the contrary, it makes it worse by producing a larger footer in the output file.
 *
 * <h2>Merging multiple files into a single output file</h2>
 * Use {@link RewriteOptions.Builder}'s constructor or methods to provide the inputFiles.
 * Please note that the schemas of all inputFiles must be the same; otherwise the rewrite will fail.
 *
 * <h2>Applying column transformations</h2>
 * Supported column transformations include pruning, masking, renaming, encrypting, and changing the codec.
 * See {@link RewriteOptions} and {@link RewriteOptions.Builder} for the full list with descriptions.
 *
 * <h2>Joining with extra files with a different schema</h2>
 * Use {@link RewriteOptions.Builder}'s constructor or methods to provide the inputFilesToJoin.
 * Please note that the schemas of all inputFilesToJoin must be the same; otherwise the rewrite will fail.
 * Requirements for joining the main inputFiles (left) with the inputFilesToJoin (right):
 * <ul>
 *   <li>the number of files may differ between the left and right sides,</li>
 *   <li>the schemas of the files within each side (left/right) must be the same, but the two sides' schemas need not be equal,</li>
 *   <li>the total number of row groups must be the same on the left and right,</li>
 *   <li>the total number of rows must be the same on the left and right,</li>
 *   <li>the global ordering of rows must be the same on the left and right.</li>
 * </ul>
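 * <p>
 * A minimal end-to-end usage sketch for merging two files. This is illustrative only: the file paths are
 * placeholders, and the builder calls assume the constructors and methods documented on
 * {@link RewriteOptions.Builder}.
 * <pre>{@code
 * RewriteOptions options = new RewriteOptions.Builder(
 *         new Configuration(),
 *         Arrays.asList(new Path("part-0.parquet"), new Path("part-1.parquet")),
 *         new Path("merged.parquet"))
 *     .build();
 * try (ParquetRewriter rewriter = new ParquetRewriter(options)) {
 *   rewriter.processBlocks(); // writes every row group from the inputs into the single output file
 * }
 * }</pre>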
*/ public class ParquetRewriter implements Closeable { // Key to store original writer version in the file key-value metadata public static final String ORIGINAL_CREATED_BY_KEY = "original.created.by"; private static final Logger LOG = LoggerFactory.getLogger(ParquetRewriter.class); private final int pageBufferSize = ParquetProperties.DEFAULT_PAGE_SIZE * 2; private final byte[] pageBuffer = new byte[pageBufferSize]; // Configurations for the new file private final CompressionCodecName newCodecName; private Map maskColumns = null; private Set encryptColumns = null; private boolean encryptMode = false; private final Map extraMetaData; // Writer to rewrite the input files private final ParquetFileWriter writer; // Number of blocks written which is used to keep track of the actual row group ordinal private int numBlocksRewritten = 0; // Reader and relevant states of the in-processing input file private final Queue inputFiles = new LinkedList<>(); private final Queue inputFilesToJoin = new LinkedList<>(); private final MessageType outSchema; // The index cache strategy private final IndexCache.CacheStrategy indexCacheStrategy; private final boolean overwriteInputWithJoinColumns; private final InternalFileEncryptor nullColumnEncryptor; private final Map renamedColumns; public ParquetRewriter(RewriteOptions options) throws IOException { this.newCodecName = options.getNewCodecName(); this.indexCacheStrategy = options.getIndexCacheStrategy(); this.overwriteInputWithJoinColumns = options.getOverwriteInputWithJoinColumns(); this.renamedColumns = options.getRenameColumns(); ParquetConfiguration conf = options.getParquetConfiguration(); this.inputFiles.addAll(getFileReaders(options.getParquetInputFiles(), conf)); this.inputFilesToJoin.addAll(getFileReaders(options.getParquetInputFilesToJoin(), conf)); this.outSchema = pruneColumnsInSchema(getSchema(), options.getPruneColumns()); this.extraMetaData = getExtraMetadata(options); ensureSameSchema(inputFiles); ensureSameSchema(inputFilesToJoin); ensureRowCount(); ensureRenamingCorrectness(outSchema, renamedColumns); OutputFile out = options.getParquetOutputFile(); LOG.info( "Start rewriting {} input file(s) {} to {}", inputFiles.size() + inputFilesToJoin.size(), Stream.concat(options.getParquetInputFiles().stream(), options.getParquetInputFilesToJoin().stream()) .collect(Collectors.toList()), out); if (options.getMaskColumns() != null) { this.maskColumns = new HashMap<>(); for (Map.Entry col : options.getMaskColumns().entrySet()) { maskColumns.put(ColumnPath.fromDotString(col.getKey()), col.getValue()); } } if (options.getEncryptColumns() != null && options.getFileEncryptionProperties() != null) { this.encryptColumns = convertToColumnPaths(options.getEncryptColumns()); this.encryptMode = true; } ParquetFileWriter.Mode writerMode = ParquetFileWriter.Mode.CREATE; this.writer = new ParquetFileWriter( out, renamedColumns.isEmpty() ? 
outSchema : getSchemaWithRenamedColumns(this.outSchema), writerMode, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, DEFAULT_STATISTICS_TRUNCATE_LENGTH, ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED, options.getFileEncryptionProperties()); writer.start(); // column nullification requires a separate encryptor and forcing other columns encryption initialization if (options.getFileEncryptionProperties() == null) { this.nullColumnEncryptor = null; } else { this.nullColumnEncryptor = new InternalFileEncryptor(options.getFileEncryptionProperties()); List columns = getSchemaWithRenamedColumns(this.outSchema).getColumns(); for (int i = 0; i < columns.size(); i++) { writer.getEncryptor() .getColumnSetup(ColumnPath.get(columns.get(i).getPath()), true, i); } } } // TODO: Should we mark it as deprecated to encourage the main constructor usage? it is also used only from // deprecated classes atm // Ctor for legacy CompressionConverter and ColumnMasker public ParquetRewriter( TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType outSchema, String originalCreatedBy, CompressionCodecName codecName, List maskColumns, MaskMode maskMode) { this.writer = writer; this.outSchema = outSchema; this.newCodecName = codecName; this.extraMetaData = new HashMap<>(meta.getFileMetaData().getKeyValueMetaData()); this.extraMetaData.put( ORIGINAL_CREATED_BY_KEY, originalCreatedBy != null ? originalCreatedBy : meta.getFileMetaData().getCreatedBy()); if (maskColumns != null && maskMode != null) { this.maskColumns = new HashMap<>(); for (String col : maskColumns) { this.maskColumns.put(ColumnPath.fromDotString(col), maskMode); } } this.inputFiles.add(reader); this.indexCacheStrategy = IndexCache.CacheStrategy.NONE; this.overwriteInputWithJoinColumns = false; this.nullColumnEncryptor = null; this.renamedColumns = new HashMap<>(); } private MessageType getSchema() { MessageType schemaMain = inputFiles.peek().getFooter().getFileMetaData().getSchema(); if (inputFilesToJoin.isEmpty()) { return schemaMain; } else { Map fieldNames = new LinkedHashMap<>(); schemaMain.getFields().forEach(x -> fieldNames.put(x.getName(), x)); inputFilesToJoin .peek() .getFooter() .getFileMetaData() .getSchema() .getFields() .forEach(x -> { if (!fieldNames.containsKey(x.getName())) { fieldNames.put(x.getName(), x); } else if (overwriteInputWithJoinColumns) { LOG.info("Column {} in inputFiles is overwritten by inputFilesToJoin side", x.getName()); fieldNames.put(x.getName(), x); } }); return new MessageType(schemaMain.getName(), new ArrayList<>(fieldNames.values())); } } private MessageType getSchemaWithRenamedColumns(MessageType schema) { List fields = schema.getFields().stream() .map(type -> { if (!renamedColumns.containsKey(type.getName())) { return type; } else if (type.isPrimitive()) { return new PrimitiveType( type.getRepetition(), type.asPrimitiveType().getPrimitiveTypeName(), renamedColumns.get(type.getName())); } else { return new GroupType( type.getRepetition(), renamedColumns.get(type.getName()), type.asGroupType().getFields()); } }) .collect(Collectors.toList()); return new MessageType(schema.getName(), fields); } private Map getExtraMetadata(RewriteOptions options) { List allFiles; if (options.getIgnoreJoinFilesMetadata()) { allFiles = new ArrayList<>(inputFiles); } else { allFiles = Stream.concat(inputFiles.stream(), inputFilesToJoin.stream()) .collect(Collectors.toList()); } Map result = new HashMap<>(); result.put( ORIGINAL_CREATED_BY_KEY, allFiles.stream() 
.map(x -> x.getFooter().getFileMetaData().getCreatedBy()) .collect(Collectors.toSet()) .stream() .reduce((a, b) -> a + "\n" + b) .orElse("")); allFiles.forEach(x -> result.putAll(x.getFileMetaData().getKeyValueMetaData())); return result; } private void ensureRowCount() { if (!inputFilesToJoin.isEmpty()) { List blocksRowCountsL = inputFiles.stream() .flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount)) .collect(Collectors.toList()); List blocksRowCountsR = inputFilesToJoin.stream() .flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount)) .collect(Collectors.toList()); if (!blocksRowCountsL.equals(blocksRowCountsR)) { throw new IllegalArgumentException( "The number of rows in each block must match! Left blocks row counts: " + blocksRowCountsL + ", right blocks row counts" + blocksRowCountsR + "."); } } } private Queue getFileReaders(List inputFiles, ParquetConfiguration conf) { LinkedList inputFileReaders = new LinkedList<>(); for (InputFile inputFile : inputFiles) { try { TransParquetFileReader reader = new TransParquetFileReader( inputFile, ParquetReadOptions.builder(conf).build()); inputFileReaders.add(reader); } catch (IOException e) { throw new IllegalArgumentException("Failed to open input file: " + inputFile, e); } } return inputFileReaders; } private void ensureSameSchema(Queue inputFileReaders) { MessageType schema = null; for (TransParquetFileReader reader : inputFileReaders) { MessageType newSchema = reader.getFooter().getFileMetaData().getSchema(); if (schema == null) { schema = newSchema; } else { // Now we enforce equality of schemas from input files for simplicity. if (!schema.equals(newSchema)) { String file = reader.getFile(); LOG.error( "Input files have different schemas, expect: {}, input: {}, current file: {}", schema, newSchema, file); throw new InvalidSchemaException("Input files have different schemas, current file: " + file); } } } } private void ensureRenamingCorrectness(MessageType schema, Map renameMap) { Set columns = schema.getFields().stream().map(Type::getName).collect(Collectors.toSet()); renameMap.forEach((src, dst) -> { if (!columns.contains(src)) { String msg = String.format("Column to rename '%s' is not found in input files schema", src); LOG.error(msg); throw new IllegalArgumentException(msg); } else if (columns.contains(dst)) { String msg = String.format("Renamed column target name '%s' is already present in a schema", dst); LOG.error(msg); throw new IllegalArgumentException(msg); } }); } @Override public void close() throws IOException { writer.end(extraMetaData); } public void processBlocks() throws IOException { TransParquetFileReader readerToJoin = null; IndexCache indexCacheToJoin = null; int blockIdxToJoin = 0; List outColumns = outSchema.getColumns(); while (!inputFiles.isEmpty()) { TransParquetFileReader reader = inputFiles.poll(); LOG.info("Rewriting input file: {}, remaining files: {}", reader.getFile(), inputFiles.size()); ParquetMetadata meta = reader.getFooter(); Set columnPaths = meta.getFileMetaData().getSchema().getColumns().stream() .map(x -> ColumnPath.get(x.getPath())) .collect(Collectors.toSet()); IndexCache indexCache = IndexCache.create(reader, columnPaths, indexCacheStrategy, true); for (int blockIdx = 0; blockIdx < meta.getBlocks().size(); blockIdx++) { BlockMetaData blockMetaData = meta.getBlocks().get(blockIdx); writer.startBlock(blockMetaData.getRowCount()); indexCache.setBlockMetadata(blockMetaData); Map pathToChunk = 
blockMetaData.getColumns().stream().collect(Collectors.toMap(x -> x.getPath(), x -> x)); if (!inputFilesToJoin.isEmpty()) { if (readerToJoin == null || ++blockIdxToJoin == readerToJoin.getFooter().getBlocks().size()) { if (readerToJoin != null) readerToJoin.close(); blockIdxToJoin = 0; readerToJoin = inputFilesToJoin.poll(); Set columnPathsToJoin = readerToJoin.getFileMetaData().getSchema().getColumns().stream() .map(x -> ColumnPath.get(x.getPath())) .collect(Collectors.toSet()); if (indexCacheToJoin != null) { indexCacheToJoin.clean(); } indexCacheToJoin = IndexCache.create(readerToJoin, columnPathsToJoin, indexCacheStrategy, true); indexCacheToJoin.setBlockMetadata( readerToJoin.getFooter().getBlocks().get(blockIdxToJoin)); } else { blockIdxToJoin++; indexCacheToJoin.setBlockMetadata( readerToJoin.getFooter().getBlocks().get(blockIdxToJoin)); } } for (int outColumnIdx = 0; outColumnIdx < outColumns.size(); outColumnIdx++) { ColumnPath colPath = ColumnPath.get(outColumns.get(outColumnIdx).getPath()); if (readerToJoin != null) { Optional chunkToJoin = readerToJoin.getFooter().getBlocks().get(blockIdxToJoin).getColumns().stream() .filter(x -> x.getPath().equals(colPath)) .findFirst(); if (chunkToJoin.isPresent() && (overwriteInputWithJoinColumns || !columnPaths.contains(colPath))) { processBlock( readerToJoin, blockIdxToJoin, outColumnIdx, indexCacheToJoin, chunkToJoin.get()); } else { processBlock(reader, blockIdx, outColumnIdx, indexCache, pathToChunk.get(colPath)); } } else { processBlock(reader, blockIdx, outColumnIdx, indexCache, pathToChunk.get(colPath)); } } writer.endBlock(); indexCache.clean(); numBlocksRewritten++; } indexCache.clean(); LOG.info("Finish rewriting input file: {}", reader.getFile()); reader.close(); } if (readerToJoin != null) readerToJoin.close(); } private ColumnPath normalizeFieldsInPath(ColumnPath path) { if (renamedColumns.isEmpty()) { return path; } else { String[] pathArray = path.toArray(); pathArray[0] = renamedColumns.getOrDefault(pathArray[0], pathArray[0]); return ColumnPath.get(pathArray); } } private PrimitiveType normalizeNameInType(PrimitiveType type) { if (renamedColumns.isEmpty()) { return type; } else { return new PrimitiveType( type.getRepetition(), type.asPrimitiveType().getPrimitiveTypeName(), renamedColumns.getOrDefault(type.getName(), type.getName())); } } private void processBlock( TransParquetFileReader reader, int blockIdx, int outColumnIdx, IndexCache indexCache, ColumnChunkMetaData chunk) throws IOException { if (chunk.isEncrypted()) { throw new IOException("Column " + chunk.getPath().toDotString() + " is already encrypted"); } ColumnChunkMetaData chunkNormalized = chunk; if (!renamedColumns.isEmpty()) { // Keep an eye if this get stale because of ColumnChunkMetaData change chunkNormalized = ColumnChunkMetaData.get( normalizeFieldsInPath(chunk.getPath()), normalizeNameInType(chunk.getPrimitiveType()), chunk.getCodec(), chunk.getEncodingStats(), chunk.getEncodings(), chunk.getStatistics(), chunk.getFirstDataPageOffset(), chunk.getDictionaryPageOffset(), chunk.getValueCount(), chunk.getTotalSize(), chunk.getTotalUncompressedSize(), chunk.getSizeStatistics()); } ColumnDescriptor descriptorOriginal = outSchema.getColumns().get(outColumnIdx); ColumnDescriptor descriptorRenamed = getSchemaWithRenamedColumns(outSchema).getColumns().get(outColumnIdx); BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(blockIdx); String originalCreatedBy = reader.getFileMetaData().getCreatedBy(); reader.setStreamPosition(chunk.getStartingPos()); 
CompressionCodecName newCodecName = this.newCodecName == null ? chunk.getCodec() : this.newCodecName; boolean encryptColumn = encryptMode && encryptColumns != null && encryptColumns.contains(chunk.getPath()); if (maskColumns != null && maskColumns.containsKey(chunk.getPath())) { // Mask column and compress it again. MaskMode maskMode = maskColumns.get(chunk.getPath()); if (maskMode.equals(MaskMode.NULLIFY)) { Type.Repetition repetition = descriptorOriginal.getPrimitiveType().getRepetition(); if (repetition.equals(Type.Repetition.REQUIRED)) { throw new IOException("Required column [" + descriptorOriginal.getPrimitiveType().getName() + "] cannot be nullified"); } nullifyColumn( reader, blockIdx, descriptorOriginal, chunk, writer, newCodecName, encryptColumn, originalCreatedBy); } else { throw new UnsupportedOperationException("Only nullify is supported for now"); } } else if (encryptMode || this.newCodecName != null) { // Prepare encryption context ColumnChunkEncryptorRunTime columnChunkEncryptorRunTime = null; if (encryptMode) { columnChunkEncryptorRunTime = new ColumnChunkEncryptorRunTime(writer.getEncryptor(), chunk, numBlocksRewritten, outColumnIdx); } // Translate compression and/or encryption writer.startColumn(descriptorRenamed, chunk.getValueCount(), newCodecName); processChunk( reader, blockMetaData.getRowCount(), chunk, newCodecName, columnChunkEncryptorRunTime, encryptColumn, indexCache.getBloomFilter(chunk), indexCache.getColumnIndex(chunk), indexCache.getOffsetIndex(chunk), originalCreatedBy); writer.endColumn(); } else { // Nothing changed, simply copy the binary data. BloomFilter bloomFilter = indexCache.getBloomFilter(chunk); ColumnIndex columnIndex = indexCache.getColumnIndex(chunk); OffsetIndex offsetIndex = indexCache.getOffsetIndex(chunk); writer.appendColumnChunk( descriptorRenamed, reader.getStream(), chunkNormalized, bloomFilter, columnIndex, offsetIndex); } } private void processChunk( TransParquetFileReader reader, long blockRowCount, ColumnChunkMetaData chunk, CompressionCodecName newCodecName, ColumnChunkEncryptorRunTime columnChunkEncryptorRunTime, boolean encryptColumn, BloomFilter bloomFilter, ColumnIndex columnIndex, OffsetIndex offsetIndex, String originalCreatedBy) throws IOException { CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0); CompressionCodecFactory.BytesInputDecompressor decompressor = null; CompressionCodecFactory.BytesInputCompressor compressor = null; if (!newCodecName.equals(chunk.getCodec())) { // Re-compress only if a different codec has been specified decompressor = codecFactory.getDecompressor(chunk.getCodec()); compressor = codecFactory.getCompressor(newCodecName); } // EncryptorRunTime is only provided when encryption is required BlockCipher.Encryptor metaEncryptor = null; BlockCipher.Encryptor dataEncryptor = null; byte[] dictPageAAD = null; byte[] dataPageAAD = null; byte[] dictPageHeaderAAD = null; byte[] dataPageHeaderAAD = null; if (columnChunkEncryptorRunTime != null) { metaEncryptor = columnChunkEncryptorRunTime.getMetaDataEncryptor(); dataEncryptor = columnChunkEncryptorRunTime.getDataEncryptor(); dictPageAAD = columnChunkEncryptorRunTime.getDictPageAAD(); dataPageAAD = columnChunkEncryptorRunTime.getDataPageAAD(); dictPageHeaderAAD = columnChunkEncryptorRunTime.getDictPageHeaderAAD(); dataPageHeaderAAD = columnChunkEncryptorRunTime.getDataPageHeaderAAD(); } if (bloomFilter != null) { writer.addBloomFilter(normalizeFieldsInPath(chunk.getPath()).toDotString(), bloomFilter); } 
reader.setStreamPosition(chunk.getStartingPos()); DictionaryPage dictionaryPage = null; long readValues = 0L; long readRows = 0L; Statistics statistics = null; boolean isColumnStatisticsMalformed = false; ParquetMetadataConverter converter = new ParquetMetadataConverter(); int pageOrdinal = 0; long totalChunkValues = chunk.getValueCount(); while (readValues < totalChunkValues) { PageHeader pageHeader = reader.readPageHeader(); int compressedPageSize = pageHeader.getCompressed_page_size(); byte[] pageLoad; switch (pageHeader.type) { case DICTIONARY_PAGE: if (dictionaryPage != null) { throw new IOException("has more than one dictionary page in column chunk: " + chunk); } // No quickUpdatePageAAD needed for dictionary page DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header; pageLoad = processPageLoad( reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size(), encryptColumn, dataEncryptor, dictPageAAD); dictionaryPage = new DictionaryPage( BytesInput.from(pageLoad), pageHeader.getUncompressed_page_size(), dictPageHeader.getNum_values(), converter.getEncoding(dictPageHeader.getEncoding())); writer.writeDictionaryPage(dictionaryPage, metaEncryptor, dictPageHeaderAAD); break; case DATA_PAGE: if (encryptColumn) { AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal); AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal); } DataPageHeader headerV1 = pageHeader.data_page_header; pageLoad = processPageLoad( reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size(), encryptColumn, dataEncryptor, dataPageAAD); statistics = convertStatistics( originalCreatedBy, normalizeNameInType(chunk.getPrimitiveType()), headerV1.getStatistics(), columnIndex, pageOrdinal, converter); if (statistics == null) { // Reach here means both the columnIndex and the page header statistics are null isColumnStatisticsMalformed = true; } else { Preconditions.checkState( !isColumnStatisticsMalformed, "Detected mixed null page statistics and non-null page statistics"); } readValues += headerV1.getNum_values(); if (offsetIndex != null) { long rowCount = 1 + offsetIndex.getLastRowIndex(pageOrdinal, blockRowCount) - offsetIndex.getFirstRowIndex(pageOrdinal); readRows += rowCount; writer.writeDataPage( toIntWithCheck(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(), BytesInput.from(pageLoad), statistics, toIntWithCheck(rowCount), converter.getEncoding(headerV1.getRepetition_level_encoding()), converter.getEncoding(headerV1.getDefinition_level_encoding()), converter.getEncoding(headerV1.getEncoding()), metaEncryptor, dataPageHeaderAAD); } else { writer.writeDataPage( toIntWithCheck(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(), BytesInput.from(pageLoad), statistics, converter.getEncoding(headerV1.getRepetition_level_encoding()), converter.getEncoding(headerV1.getDefinition_level_encoding()), converter.getEncoding(headerV1.getEncoding()), metaEncryptor, dataPageHeaderAAD); } pageOrdinal++; break; case DATA_PAGE_V2: if (encryptColumn) { AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal); AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal); } DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2; int rlLength = headerV2.getRepetition_levels_byte_length(); BytesInput rlLevels = readBlockAllocate(rlLength, reader); int dlLength = headerV2.getDefinition_levels_byte_length(); BytesInput dlLevels = readBlockAllocate(dlLength, reader); int 
payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength; int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength; pageLoad = processPageLoad( reader, headerV2.is_compressed, compressor, decompressor, payLoadLength, rawDataLength, encryptColumn, dataEncryptor, dataPageAAD); statistics = convertStatistics( originalCreatedBy, normalizeNameInType(chunk.getPrimitiveType()), headerV2.getStatistics(), columnIndex, pageOrdinal, converter); if (statistics == null) { // Reach here means both the columnIndex and the page header statistics are null isColumnStatisticsMalformed = true; } else { Preconditions.checkState( !isColumnStatisticsMalformed, "Detected mixed null page statistics and non-null page statistics"); } readValues += headerV2.getNum_values(); readRows += headerV2.getNum_rows(); writer.writeDataPageV2( headerV2.getNum_rows(), headerV2.getNum_nulls(), headerV2.getNum_values(), rlLevels, dlLevels, converter.getEncoding(headerV2.getEncoding()), BytesInput.from(pageLoad), rawDataLength, statistics, metaEncryptor, dataPageHeaderAAD); pageOrdinal++; break; default: LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize); break; } } Preconditions.checkState( readRows == 0 || readRows == blockRowCount, "Read row count: %s not match with block total row count: %s", readRows, blockRowCount); if (isColumnStatisticsMalformed) { // All the column statistics are invalid, so we need to overwrite the column statistics writer.invalidateStatistics(chunk.getStatistics()); } } private Statistics convertStatistics( String createdBy, PrimitiveType type, org.apache.parquet.format.Statistics pageStatistics, ColumnIndex columnIndex, int pageIndex, ParquetMetadataConverter converter) throws IOException { if (columnIndex != null) { if (columnIndex.getNullPages() == null) { throw new IOException( "columnIndex has null variable 'nullPages' which indicates corrupted data for type: " + type.getName()); } if (pageIndex > columnIndex.getNullPages().size()) { throw new IOException( "There are more pages " + pageIndex + " found in the column than in the columnIndex " + columnIndex.getNullPages().size()); } org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); statsBuilder.withNumNulls(columnIndex.getNullCounts().get(pageIndex)); if (!columnIndex.getNullPages().get(pageIndex)) { statsBuilder.withMin( columnIndex.getMinValues().get(pageIndex).array().clone()); statsBuilder.withMax( columnIndex.getMaxValues().get(pageIndex).array().clone()); } return statsBuilder.build(); } else if (pageStatistics != null) { return converter.fromParquetStatistics(createdBy, pageStatistics, type); } else { return null; } } private byte[] processPageLoad( TransParquetFileReader reader, boolean isCompressed, CompressionCodecFactory.BytesInputCompressor compressor, CompressionCodecFactory.BytesInputDecompressor decompressor, int payloadLength, int rawDataLength, boolean encrypt, BlockCipher.Encryptor dataEncryptor, byte[] AAD) throws IOException { BytesInput data = readBlock(payloadLength, reader); // recompress page load if (compressor != null) { if (isCompressed) { data = decompressor.decompress(data, rawDataLength); } data = compressor.compress(data); } if (!encrypt) { return data.toByteArray(); } // encrypt page load return dataEncryptor.encrypt(data.toByteArray(), AAD); } public BytesInput readBlock(int length, TransParquetFileReader reader) throws IOException { byte[] 
data; if (length > pageBufferSize) { data = new byte[length]; } else { data = pageBuffer; } reader.blockRead(data, 0, length); return BytesInput.from(data, 0, length); } public BytesInput readBlockAllocate(int length, TransParquetFileReader reader) throws IOException { byte[] data = new byte[length]; reader.blockRead(data, 0, length); return BytesInput.from(data, 0, length); } private int toIntWithCheck(long size) { if ((int) size != size) { throw new ParquetEncodingException("size is bigger than " + Integer.MAX_VALUE + " bytes: " + size); } return (int) size; } // We have to rewrite getPaths because MessageType only get level 0 paths private void getPaths(GroupType schema, List paths, String parent) { List fields = schema.getFields(); String prefix = (parent == null) ? "" : parent + "."; for (Type field : fields) { paths.add(prefix + field.getName()); if (field instanceof GroupType) { getPaths(field.asGroupType(), paths, prefix + field.getName()); } } } private MessageType pruneColumnsInSchema(MessageType schema, List pruneColumns) { if (pruneColumns == null || pruneColumns.isEmpty()) { return schema; } else { List paths = new ArrayList<>(); getPaths(schema, paths, null); for (String col : pruneColumns) { if (!paths.contains(col)) { LOG.warn("Input column name {} doesn't show up in the schema", col); } } Set prunePaths = convertToColumnPaths(pruneColumns); List fields = schema.getFields(); List currentPath = new ArrayList<>(); List prunedFields = pruneColumnsInFields(fields, currentPath, prunePaths); return new MessageType(schema.getName(), prunedFields); } } private List pruneColumnsInFields(List fields, List currentPath, Set prunePaths) { List prunedFields = new ArrayList<>(); for (Type childField : fields) { Type prunedChildField = pruneColumnsInField(childField, currentPath, prunePaths); if (prunedChildField != null) { prunedFields.add(prunedChildField); } } return prunedFields; } private Type pruneColumnsInField(Type field, List currentPath, Set prunePaths) { String fieldName = field.getName(); currentPath.add(fieldName); ColumnPath path = ColumnPath.get(currentPath.toArray(new String[0])); Type prunedField = null; if (!prunePaths.contains(path)) { if (field.isPrimitive()) { prunedField = field; } else { List childFields = ((GroupType) field).getFields(); List prunedFields = pruneColumnsInFields(childFields, currentPath, prunePaths); if (!prunedFields.isEmpty()) { prunedField = ((GroupType) field).withNewFields(prunedFields); } } } currentPath.remove(currentPath.size() - 1); return prunedField; } private Set convertToColumnPaths(List cols) { Set prunePaths = new HashSet<>(); for (String col : cols) { prunePaths.add(ColumnPath.fromDotString(col)); } return prunePaths; } private void nullifyColumn( TransParquetFileReader reader, int blockIndex, ColumnDescriptor descriptor, ColumnChunkMetaData chunk, ParquetFileWriter writer, CompressionCodecName newCodecName, boolean encryptColumn, String originalCreatedBy) throws IOException { if (encryptColumn) { Preconditions.checkArgument(writer.getEncryptor() != null, "Missing encryptor"); } long totalChunkValues = chunk.getValueCount(); int dMax = descriptor.getMaxDefinitionLevel(); PageReadStore pageReadStore = reader.readRowGroup(blockIndex); ColumnReadStoreImpl crStore = new ColumnReadStoreImpl(pageReadStore, new DummyGroupConverter(), outSchema, originalCreatedBy); ColumnReader cReader = crStore.getColumnReader(descriptor); ParquetProperties.WriterVersion writerVersion = chunk.getEncodingStats().usesV2Pages() ? 
ParquetProperties.WriterVersion.PARQUET_2_0 : ParquetProperties.WriterVersion.PARQUET_1_0; ParquetProperties props = ParquetProperties.builder().withWriterVersion(writerVersion).build(); CodecFactory codecFactory = new CodecFactory(new Configuration(), props.getPageSizeThreshold()); CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(newCodecName); // Create new schema that only has the current column MessageType newSchema = getSchemaWithRenamedColumns(newSchema(outSchema, descriptor)); ColumnChunkPageWriteStore cPageStore = new ColumnChunkPageWriteStore( compressor, newSchema, props.getAllocator(), props.getColumnIndexTruncateLength(), props.getPageWriteChecksumEnabled(), nullColumnEncryptor, numBlocksRewritten); ColumnWriteStore cStore = props.newColumnWriteStore(newSchema, cPageStore); ColumnWriter cWriter = cStore.getColumnWriter(descriptor); for (int i = 0; i < totalChunkValues; i++) { int rlvl = cReader.getCurrentRepetitionLevel(); int dlvl = cReader.getCurrentDefinitionLevel(); if (dlvl == dMax) { // since we checked ether optional or repeated, dlvl should be > 0 if (dlvl == 0) { throw new IOException("definition level is detected to be 0 for column " + chunk.getPath().toDotString() + " to be nullified"); } // we just write one null for the whole list at the top level, // instead of nullify the elements in the list one by one if (rlvl == 0) { cWriter.writeNull(rlvl, dlvl - 1); } } else { cWriter.writeNull(rlvl, dlvl); } cStore.endRecord(); } pageReadStore.close(); cStore.flush(); cPageStore.flushToFileWriter(writer); cStore.close(); cWriter.close(); } private MessageType newSchema(MessageType schema, ColumnDescriptor descriptor) { String[] path = descriptor.getPath(); Type type = schema.getType(path); if (path.length == 1) { return new MessageType(schema.getName(), type); } for (Type field : schema.getFields()) { if (!field.isPrimitive()) { Type newType = extractField(field.asGroupType(), type); if (newType != null) { return new MessageType(schema.getName(), newType); } } } // We should never hit this because 'type' is returned by schema.getType(). 
throw new RuntimeException("No field is found"); } private Type extractField(GroupType candidate, Type targetField) { if (targetField.equals(candidate)) { return targetField; } // In case 'type' is a descendants of candidate for (Type field : candidate.asGroupType().getFields()) { if (field.isPrimitive()) { if (field.equals(targetField)) { return new GroupType(candidate.getRepetition(), candidate.getName(), targetField); } } else { Type tempField = extractField(field.asGroupType(), targetField); if (tempField != null) { return new GroupType(candidate.getRepetition(), candidate.getName(), tempField); } } } return null; } private static final class DummyGroupConverter extends GroupConverter { @Override public void start() {} @Override public void end() {} @Override public Converter getConverter(int fieldIndex) { return new DummyConverter(); } } private static final class DummyConverter extends PrimitiveConverter { @Override public GroupConverter asGroupConverter() { return new DummyGroupConverter(); } } private static class ColumnChunkEncryptorRunTime { private final InternalColumnEncryptionSetup colEncrSetup; private final BlockCipher.Encryptor dataEncryptor; private final BlockCipher.Encryptor metaDataEncryptor; private final byte[] fileAAD; private final byte[] dataPageHeaderAAD; private final byte[] dataPageAAD; private final byte[] dictPageHeaderAAD; private final byte[] dictPageAAD; public ColumnChunkEncryptorRunTime( InternalFileEncryptor fileEncryptor, ColumnChunkMetaData chunk, int blockId, int columnId) throws IOException { Preconditions.checkArgument( fileEncryptor != null, "FileEncryptor is required to create ColumnChunkEncryptorRunTime"); this.colEncrSetup = fileEncryptor.getColumnSetup(chunk.getPath(), true, columnId); this.dataEncryptor = colEncrSetup.getDataEncryptor(); this.metaDataEncryptor = colEncrSetup.getMetaDataEncryptor(); this.fileAAD = fileEncryptor.getFileAAD(); if (colEncrSetup != null && colEncrSetup.isEncrypted()) { this.dataPageHeaderAAD = createAAD(ModuleType.DataPageHeader, blockId, columnId); this.dataPageAAD = createAAD(ModuleType.DataPage, blockId, columnId); this.dictPageHeaderAAD = createAAD(ModuleType.DictionaryPageHeader, blockId, columnId); this.dictPageAAD = createAAD(ModuleType.DictionaryPage, blockId, columnId); } else { this.dataPageHeaderAAD = null; this.dataPageAAD = null; this.dictPageHeaderAAD = null; this.dictPageAAD = null; } } private byte[] createAAD(ModuleType moduleType, int blockId, int columnId) { return AesCipher.createModuleAAD(fileAAD, moduleType, blockId, columnId, 0); } public BlockCipher.Encryptor getDataEncryptor() { return this.dataEncryptor; } public BlockCipher.Encryptor getMetaDataEncryptor() { return this.metaDataEncryptor; } public byte[] getDataPageHeaderAAD() { return this.dataPageHeaderAAD; } public byte[] getDataPageAAD() { return this.dataPageAAD; } public byte[] getDictPageHeaderAAD() { return this.dictPageHeaderAAD; } public byte[] getDictPageAAD() { return this.dictPageAAD; } } }
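
Beyond plain merging, the column transformations referenced in the class Javadoc (pruning, masking, changing the codec) are configured on RewriteOptions before the rewriter is constructed. The sketch below is a hedged example, not part of this source file: the builder methods prune, mask, and transform are assumed to be available on RewriteOptions.Builder in this version, and the paths and column names are placeholders.

    import java.util.Collections;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.hadoop.metadata.CompressionCodecName;
    import org.apache.parquet.hadoop.rewrite.MaskMode;
    import org.apache.parquet.hadoop.rewrite.ParquetRewriter;
    import org.apache.parquet.hadoop.rewrite.RewriteOptions;

    public class RewriteWithTransformationsExample {
      public static void main(String[] args) throws Exception {
        // Placeholder paths and column names; builder methods assumed per RewriteOptions.Builder.
        RewriteOptions options = new RewriteOptions.Builder(
                new Configuration(), new Path("input.parquet"), new Path("output.parquet"))
            .prune(Collections.singletonList("debug_payload"))       // drop this column entirely
            .mask(Collections.singletonMap("ssn", MaskMode.NULLIFY)) // keep the column, null out its values
            .transform(CompressionCodecName.ZSTD)                    // re-compress chunks with a new codec
            .build();
        try (ParquetRewriter rewriter = new ParquetRewriter(options)) {
          rewriter.processBlocks(); // the row-group structure of the input is preserved in the output
        }
      }
    }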



