/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop.rewrite;
import static org.apache.parquet.column.ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
import static org.apache.parquet.column.ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH;
import static org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.ColumnWriteStore;
import org.apache.parquet.column.ColumnWriter;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.ColumnChunkPageWriteStore;
import org.apache.parquet.hadoop.IndexCache;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopCodecs;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.InvalidSchemaException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Rewrites multiple input files into a single output file.
* <p>
* Supported functionality:
* <ul>
*   <li>Merging multiple files into a single one</li>
*   <li>Applying column transformations</li>
*   <li>Joining with extra files with a different schema</li>
* </ul>
* <p>
* Note that the total number of row groups from all input files is preserved in the output file.
* This may not be optimal if row groups are very small and will not solve small file problems. Instead, it will
* make it worse to have a large file footer in the output file.
*
* <h2>Merging multiple files into a single output file</h2>
* Use {@link RewriteOptions.Builder}'s constructor or methods to provide <code>inputFiles</code>.
* Please note the schema of all <code>inputFiles</code> must be the same, otherwise the rewrite will fail.
*
* <h2>Applying column transformations</h2>
* Some supported column transformations: pruning, masking, renaming, encrypting, changing a codec.
* See {@link RewriteOptions} and {@link RewriteOptions.Builder} for the full list with descriptions.
*
* <h2>Joining with extra files with a different schema</h2>
* Use {@link RewriteOptions.Builder}'s constructor or methods to provide <code>inputFilesToJoin</code>.
* Please note the schema of all <code>inputFilesToJoin</code> must be the same, otherwise the rewrite will fail.
* Requirements for joining the main <code>inputFiles</code> (left) and <code>inputFilesToJoin</code> (right):
* <ul>
*   <li>the number of files might be different on the left and right,</li>
*   <li>the schema of files inside each group (left/right) must be the same, but the two schemas do not have to be equal,</li>
*   <li>the total number of row groups must be the same on the left and right,</li>
*   <li>the total number of rows must be the same on the left and right,</li>
*   <li>the global ordering of rows must be the same on the left and right.</li>
* </ul>
*
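* <p>
* A minimal usage sketch (illustrative only; the file paths are placeholders and the exact
* {@link RewriteOptions.Builder} methods used here should be checked against that builder):
* <pre>{@code
* RewriteOptions options = new RewriteOptions.Builder(
*         new Configuration(),
*         Arrays.asList(new Path("/tmp/in1.parquet"), new Path("/tmp/in2.parquet")),
*         new Path("/tmp/out.parquet"))
*     .prune(Collections.singletonList("column_to_drop"))
*     .transform(CompressionCodecName.ZSTD)
*     .build();
* try (ParquetRewriter rewriter = new ParquetRewriter(options)) {
*   rewriter.processBlocks();
* }
* }</pre>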
*/
public class ParquetRewriter implements Closeable {
// Key to store original writer version in the file key-value metadata
public static final String ORIGINAL_CREATED_BY_KEY = "original.created.by";
private static final Logger LOG = LoggerFactory.getLogger(ParquetRewriter.class);
private final int pageBufferSize = ParquetProperties.DEFAULT_PAGE_SIZE * 2;
private final byte[] pageBuffer = new byte[pageBufferSize];
// Configurations for the new file
private final CompressionCodecName newCodecName;
private Map<ColumnPath, MaskMode> maskColumns = null;
private Set<ColumnPath> encryptColumns = null;
private boolean encryptMode = false;
private final Map<String, String> extraMetaData;
// Writer to rewrite the input files
private final ParquetFileWriter writer;
// Number of blocks written which is used to keep track of the actual row group ordinal
private int numBlocksRewritten = 0;
// Reader and relevant states of the in-processing input file
private final Queue<TransParquetFileReader> inputFiles = new LinkedList<>();
private final Queue<TransParquetFileReader> inputFilesToJoin = new LinkedList<>();
private final MessageType outSchema;
// The index cache strategy
private final IndexCache.CacheStrategy indexCacheStrategy;
private final boolean overwriteInputWithJoinColumns;
private final InternalFileEncryptor nullColumnEncryptor;
private final Map<String, String> renamedColumns;
public ParquetRewriter(RewriteOptions options) throws IOException {
this.newCodecName = options.getNewCodecName();
this.indexCacheStrategy = options.getIndexCacheStrategy();
this.overwriteInputWithJoinColumns = options.getOverwriteInputWithJoinColumns();
this.renamedColumns = options.getRenameColumns();
ParquetConfiguration conf = options.getParquetConfiguration();
this.inputFiles.addAll(getFileReaders(options.getParquetInputFiles(), conf));
this.inputFilesToJoin.addAll(getFileReaders(options.getParquetInputFilesToJoin(), conf));
this.outSchema = pruneColumnsInSchema(getSchema(), options.getPruneColumns());
this.extraMetaData = getExtraMetadata(options);
ensureSameSchema(inputFiles);
ensureSameSchema(inputFilesToJoin);
ensureRowCount();
ensureRenamingCorrectness(outSchema, renamedColumns);
OutputFile out = options.getParquetOutputFile();
LOG.info(
"Start rewriting {} input file(s) {} to {}",
inputFiles.size() + inputFilesToJoin.size(),
Stream.concat(options.getParquetInputFiles().stream(), options.getParquetInputFilesToJoin().stream())
.collect(Collectors.toList()),
out);
if (options.getMaskColumns() != null) {
this.maskColumns = new HashMap<>();
for (Map.Entry<String, MaskMode> col : options.getMaskColumns().entrySet()) {
maskColumns.put(ColumnPath.fromDotString(col.getKey()), col.getValue());
}
}
if (options.getEncryptColumns() != null && options.getFileEncryptionProperties() != null) {
this.encryptColumns = convertToColumnPaths(options.getEncryptColumns());
this.encryptMode = true;
}
ParquetFileWriter.Mode writerMode = ParquetFileWriter.Mode.CREATE;
this.writer = new ParquetFileWriter(
out,
renamedColumns.isEmpty() ? outSchema : getSchemaWithRenamedColumns(this.outSchema),
writerMode,
DEFAULT_BLOCK_SIZE,
MAX_PADDING_SIZE_DEFAULT,
DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
DEFAULT_STATISTICS_TRUNCATE_LENGTH,
ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED,
options.getFileEncryptionProperties());
writer.start();
// column nullification requires a separate encryptor and forcing other columns encryption initialization
if (options.getFileEncryptionProperties() == null) {
this.nullColumnEncryptor = null;
} else {
this.nullColumnEncryptor = new InternalFileEncryptor(options.getFileEncryptionProperties());
List<ColumnDescriptor> columns =
getSchemaWithRenamedColumns(this.outSchema).getColumns();
for (int i = 0; i < columns.size(); i++) {
writer.getEncryptor()
.getColumnSetup(ColumnPath.get(columns.get(i).getPath()), true, i);
}
}
}
// TODO: Should we mark it as deprecated to encourage the main constructor usage? it is also used only from
// deprecated classes atm
// Ctor for legacy CompressionConverter and ColumnMasker
public ParquetRewriter(
TransParquetFileReader reader,
ParquetFileWriter writer,
ParquetMetadata meta,
MessageType outSchema,
String originalCreatedBy,
CompressionCodecName codecName,
List<String> maskColumns,
MaskMode maskMode) {
this.writer = writer;
this.outSchema = outSchema;
this.newCodecName = codecName;
this.extraMetaData = new HashMap<>(meta.getFileMetaData().getKeyValueMetaData());
this.extraMetaData.put(
ORIGINAL_CREATED_BY_KEY,
originalCreatedBy != null
? originalCreatedBy
: meta.getFileMetaData().getCreatedBy());
if (maskColumns != null && maskMode != null) {
this.maskColumns = new HashMap<>();
for (String col : maskColumns) {
this.maskColumns.put(ColumnPath.fromDotString(col), maskMode);
}
}
this.inputFiles.add(reader);
this.indexCacheStrategy = IndexCache.CacheStrategy.NONE;
this.overwriteInputWithJoinColumns = false;
this.nullColumnEncryptor = null;
this.renamedColumns = new HashMap<>();
}
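// Builds the output schema from the first main input file and, when join files are configured,
// from the first join file as well: join-side fields are appended after the main fields, and a field
// with the same name is taken from the join side only if overwriteInputWithJoinColumns is set.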
private MessageType getSchema() {
MessageType schemaMain = inputFiles.peek().getFooter().getFileMetaData().getSchema();
if (inputFilesToJoin.isEmpty()) {
return schemaMain;
} else {
Map<String, Type> fieldNames = new LinkedHashMap<>();
schemaMain.getFields().forEach(x -> fieldNames.put(x.getName(), x));
inputFilesToJoin
.peek()
.getFooter()
.getFileMetaData()
.getSchema()
.getFields()
.forEach(x -> {
if (!fieldNames.containsKey(x.getName())) {
fieldNames.put(x.getName(), x);
} else if (overwriteInputWithJoinColumns) {
LOG.info("Column {} in inputFiles is overwritten by inputFilesToJoin side", x.getName());
fieldNames.put(x.getName(), x);
}
});
return new MessageType(schemaMain.getName(), new ArrayList<>(fieldNames.values()));
}
}
private MessageType getSchemaWithRenamedColumns(MessageType schema) {
List<Type> fields = schema.getFields().stream()
.map(type -> {
if (!renamedColumns.containsKey(type.getName())) {
return type;
} else if (type.isPrimitive()) {
return new PrimitiveType(
type.getRepetition(),
type.asPrimitiveType().getPrimitiveTypeName(),
renamedColumns.get(type.getName()));
} else {
return new GroupType(
type.getRepetition(),
renamedColumns.get(type.getName()),
type.asGroupType().getFields());
}
})
.collect(Collectors.toList());
return new MessageType(schema.getName(), fields);
}
private Map<String, String> getExtraMetadata(RewriteOptions options) {
List<TransParquetFileReader> allFiles;
if (options.getIgnoreJoinFilesMetadata()) {
allFiles = new ArrayList<>(inputFiles);
} else {
allFiles = Stream.concat(inputFiles.stream(), inputFilesToJoin.stream())
.collect(Collectors.toList());
}
Map<String, String> result = new HashMap<>();
result.put(
ORIGINAL_CREATED_BY_KEY,
allFiles.stream()
.map(x -> x.getFooter().getFileMetaData().getCreatedBy())
.collect(Collectors.toSet())
.stream()
.reduce((a, b) -> a + "\n" + b)
.orElse(""));
allFiles.forEach(x -> result.putAll(x.getFileMetaData().getKeyValueMetaData()));
return result;
}
private void ensureRowCount() {
if (!inputFilesToJoin.isEmpty()) {
List<Long> blocksRowCountsL = inputFiles.stream()
.flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount))
.collect(Collectors.toList());
List<Long> blocksRowCountsR = inputFilesToJoin.stream()
.flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount))
.collect(Collectors.toList());
if (!blocksRowCountsL.equals(blocksRowCountsR)) {
throw new IllegalArgumentException(
"The number of rows in each block must match! Left blocks row counts: " + blocksRowCountsL
+ ", right blocks row counts" + blocksRowCountsR + ".");
}
}
}
private Queue<TransParquetFileReader> getFileReaders(List<InputFile> inputFiles, ParquetConfiguration conf) {
LinkedList<TransParquetFileReader> inputFileReaders = new LinkedList<>();
for (InputFile inputFile : inputFiles) {
try {
TransParquetFileReader reader = new TransParquetFileReader(
inputFile, ParquetReadOptions.builder(conf).build());
inputFileReaders.add(reader);
} catch (IOException e) {
throw new IllegalArgumentException("Failed to open input file: " + inputFile, e);
}
}
return inputFileReaders;
}
private void ensureSameSchema(Queue<TransParquetFileReader> inputFileReaders) {
MessageType schema = null;
for (TransParquetFileReader reader : inputFileReaders) {
MessageType newSchema = reader.getFooter().getFileMetaData().getSchema();
if (schema == null) {
schema = newSchema;
} else {
// Now we enforce equality of schemas from input files for simplicity.
if (!schema.equals(newSchema)) {
String file = reader.getFile();
LOG.error(
"Input files have different schemas, expect: {}, input: {}, current file: {}",
schema,
newSchema,
file);
throw new InvalidSchemaException("Input files have different schemas, current file: " + file);
}
}
}
}
private void ensureRenamingCorrectness(MessageType schema, Map<String, String> renameMap) {
Set<String> columns = schema.getFields().stream().map(Type::getName).collect(Collectors.toSet());
renameMap.forEach((src, dst) -> {
if (!columns.contains(src)) {
String msg = String.format("Column to rename '%s' is not found in input files schema", src);
LOG.error(msg);
throw new IllegalArgumentException(msg);
} else if (columns.contains(dst)) {
String msg = String.format("Renamed column target name '%s' is already present in a schema", dst);
LOG.error(msg);
throw new IllegalArgumentException(msg);
}
});
}
@Override
public void close() throws IOException {
writer.end(extraMetaData);
}
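// Rewrites all row groups of all input files into the output file, one row group at a time.
// When join files are provided, their row groups are consumed in lock-step with the main files and,
// for each output column, the chunk is taken either from the main side or from the join side
// (depending on overwriteInputWithJoinColumns and on which side contains the column).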
public void processBlocks() throws IOException {
TransParquetFileReader readerToJoin = null;
IndexCache indexCacheToJoin = null;
int blockIdxToJoin = 0;
List<ColumnDescriptor> outColumns = outSchema.getColumns();
while (!inputFiles.isEmpty()) {
TransParquetFileReader reader = inputFiles.poll();
LOG.info("Rewriting input file: {}, remaining files: {}", reader.getFile(), inputFiles.size());
ParquetMetadata meta = reader.getFooter();
Set<ColumnPath> columnPaths = meta.getFileMetaData().getSchema().getColumns().stream()
.map(x -> ColumnPath.get(x.getPath()))
.collect(Collectors.toSet());
IndexCache indexCache = IndexCache.create(reader, columnPaths, indexCacheStrategy, true);
for (int blockIdx = 0; blockIdx < meta.getBlocks().size(); blockIdx++) {
BlockMetaData blockMetaData = meta.getBlocks().get(blockIdx);
writer.startBlock(blockMetaData.getRowCount());
indexCache.setBlockMetadata(blockMetaData);
Map<ColumnPath, ColumnChunkMetaData> pathToChunk =
blockMetaData.getColumns().stream().collect(Collectors.toMap(x -> x.getPath(), x -> x));
if (!inputFilesToJoin.isEmpty()) {
if (readerToJoin == null
|| ++blockIdxToJoin
== readerToJoin.getFooter().getBlocks().size()) {
if (readerToJoin != null) readerToJoin.close();
blockIdxToJoin = 0;
readerToJoin = inputFilesToJoin.poll();
Set<ColumnPath> columnPathsToJoin =
readerToJoin.getFileMetaData().getSchema().getColumns().stream()
.map(x -> ColumnPath.get(x.getPath()))
.collect(Collectors.toSet());
if (indexCacheToJoin != null) {
indexCacheToJoin.clean();
}
indexCacheToJoin = IndexCache.create(readerToJoin, columnPathsToJoin, indexCacheStrategy, true);
indexCacheToJoin.setBlockMetadata(
readerToJoin.getFooter().getBlocks().get(blockIdxToJoin));
} else {
blockIdxToJoin++;
indexCacheToJoin.setBlockMetadata(
readerToJoin.getFooter().getBlocks().get(blockIdxToJoin));
}
}
for (int outColumnIdx = 0; outColumnIdx < outColumns.size(); outColumnIdx++) {
ColumnPath colPath =
ColumnPath.get(outColumns.get(outColumnIdx).getPath());
if (readerToJoin != null) {
Optional<ColumnChunkMetaData> chunkToJoin =
readerToJoin.getFooter().getBlocks().get(blockIdxToJoin).getColumns().stream()
.filter(x -> x.getPath().equals(colPath))
.findFirst();
if (chunkToJoin.isPresent()
&& (overwriteInputWithJoinColumns || !columnPaths.contains(colPath))) {
processBlock(
readerToJoin, blockIdxToJoin, outColumnIdx, indexCacheToJoin, chunkToJoin.get());
} else {
processBlock(reader, blockIdx, outColumnIdx, indexCache, pathToChunk.get(colPath));
}
} else {
processBlock(reader, blockIdx, outColumnIdx, indexCache, pathToChunk.get(colPath));
}
}
writer.endBlock();
indexCache.clean();
numBlocksRewritten++;
}
indexCache.clean();
LOG.info("Finish rewriting input file: {}", reader.getFile());
reader.close();
}
if (readerToJoin != null) readerToJoin.close();
}
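// Applies the configured column renames to a column path; only the first (top-level) path element
// can be renamed, nested field names are left untouched.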
private ColumnPath normalizeFieldsInPath(ColumnPath path) {
if (renamedColumns.isEmpty()) {
return path;
} else {
String[] pathArray = path.toArray();
pathArray[0] = renamedColumns.getOrDefault(pathArray[0], pathArray[0]);
return ColumnPath.get(pathArray);
}
}
private PrimitiveType normalizeNameInType(PrimitiveType type) {
if (renamedColumns.isEmpty()) {
return type;
} else {
return new PrimitiveType(
type.getRepetition(),
type.asPrimitiveType().getPrimitiveTypeName(),
renamedColumns.getOrDefault(type.getName(), type.getName()));
}
}
private void processBlock(
TransParquetFileReader reader,
int blockIdx,
int outColumnIdx,
IndexCache indexCache,
ColumnChunkMetaData chunk)
throws IOException {
if (chunk.isEncrypted()) {
throw new IOException("Column " + chunk.getPath().toDotString() + " is already encrypted");
}
ColumnChunkMetaData chunkNormalized = chunk;
if (!renamedColumns.isEmpty()) {
// Keep an eye on whether this gets stale because of ColumnChunkMetaData changes
chunkNormalized = ColumnChunkMetaData.get(
normalizeFieldsInPath(chunk.getPath()),
normalizeNameInType(chunk.getPrimitiveType()),
chunk.getCodec(),
chunk.getEncodingStats(),
chunk.getEncodings(),
chunk.getStatistics(),
chunk.getFirstDataPageOffset(),
chunk.getDictionaryPageOffset(),
chunk.getValueCount(),
chunk.getTotalSize(),
chunk.getTotalUncompressedSize(),
chunk.getSizeStatistics());
}
ColumnDescriptor descriptorOriginal = outSchema.getColumns().get(outColumnIdx);
ColumnDescriptor descriptorRenamed =
getSchemaWithRenamedColumns(outSchema).getColumns().get(outColumnIdx);
BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(blockIdx);
String originalCreatedBy = reader.getFileMetaData().getCreatedBy();
reader.setStreamPosition(chunk.getStartingPos());
CompressionCodecName newCodecName = this.newCodecName == null ? chunk.getCodec() : this.newCodecName;
boolean encryptColumn = encryptMode && encryptColumns != null && encryptColumns.contains(chunk.getPath());
if (maskColumns != null && maskColumns.containsKey(chunk.getPath())) {
// Mask column and compress it again.
MaskMode maskMode = maskColumns.get(chunk.getPath());
if (maskMode.equals(MaskMode.NULLIFY)) {
Type.Repetition repetition =
descriptorOriginal.getPrimitiveType().getRepetition();
if (repetition.equals(Type.Repetition.REQUIRED)) {
throw new IOException("Required column ["
+ descriptorOriginal.getPrimitiveType().getName() + "] cannot be nullified");
}
nullifyColumn(
reader,
blockIdx,
descriptorOriginal,
chunk,
writer,
newCodecName,
encryptColumn,
originalCreatedBy);
} else {
throw new UnsupportedOperationException("Only nullify is supported for now");
}
} else if (encryptMode || this.newCodecName != null) {
// Prepare encryption context
ColumnChunkEncryptorRunTime columnChunkEncryptorRunTime = null;
if (encryptMode) {
columnChunkEncryptorRunTime =
new ColumnChunkEncryptorRunTime(writer.getEncryptor(), chunk, numBlocksRewritten, outColumnIdx);
}
// Translate compression and/or encryption
writer.startColumn(descriptorRenamed, chunk.getValueCount(), newCodecName);
processChunk(
reader,
blockMetaData.getRowCount(),
chunk,
newCodecName,
columnChunkEncryptorRunTime,
encryptColumn,
indexCache.getBloomFilter(chunk),
indexCache.getColumnIndex(chunk),
indexCache.getOffsetIndex(chunk),
originalCreatedBy);
writer.endColumn();
} else {
// Nothing changed, simply copy the binary data.
BloomFilter bloomFilter = indexCache.getBloomFilter(chunk);
ColumnIndex columnIndex = indexCache.getColumnIndex(chunk);
OffsetIndex offsetIndex = indexCache.getOffsetIndex(chunk);
writer.appendColumnChunk(
descriptorRenamed, reader.getStream(), chunkNormalized, bloomFilter, columnIndex, offsetIndex);
}
}
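// Rewrites a single column chunk page by page: payloads are re-compressed and/or encrypted as requested,
// page statistics are rebuilt (from the column index when available), and the bloom filter is re-attached.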
private void processChunk(
TransParquetFileReader reader,
long blockRowCount,
ColumnChunkMetaData chunk,
CompressionCodecName newCodecName,
ColumnChunkEncryptorRunTime columnChunkEncryptorRunTime,
boolean encryptColumn,
BloomFilter bloomFilter,
ColumnIndex columnIndex,
OffsetIndex offsetIndex,
String originalCreatedBy)
throws IOException {
CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
CompressionCodecFactory.BytesInputDecompressor decompressor = null;
CompressionCodecFactory.BytesInputCompressor compressor = null;
if (!newCodecName.equals(chunk.getCodec())) {
// Re-compress only if a different codec has been specified
decompressor = codecFactory.getDecompressor(chunk.getCodec());
compressor = codecFactory.getCompressor(newCodecName);
}
// EncryptorRunTime is only provided when encryption is required
BlockCipher.Encryptor metaEncryptor = null;
BlockCipher.Encryptor dataEncryptor = null;
byte[] dictPageAAD = null;
byte[] dataPageAAD = null;
byte[] dictPageHeaderAAD = null;
byte[] dataPageHeaderAAD = null;
if (columnChunkEncryptorRunTime != null) {
metaEncryptor = columnChunkEncryptorRunTime.getMetaDataEncryptor();
dataEncryptor = columnChunkEncryptorRunTime.getDataEncryptor();
dictPageAAD = columnChunkEncryptorRunTime.getDictPageAAD();
dataPageAAD = columnChunkEncryptorRunTime.getDataPageAAD();
dictPageHeaderAAD = columnChunkEncryptorRunTime.getDictPageHeaderAAD();
dataPageHeaderAAD = columnChunkEncryptorRunTime.getDataPageHeaderAAD();
}
if (bloomFilter != null) {
writer.addBloomFilter(normalizeFieldsInPath(chunk.getPath()).toDotString(), bloomFilter);
}
reader.setStreamPosition(chunk.getStartingPos());
DictionaryPage dictionaryPage = null;
long readValues = 0L;
long readRows = 0L;
Statistics<?> statistics = null;
boolean isColumnStatisticsMalformed = false;
ParquetMetadataConverter converter = new ParquetMetadataConverter();
int pageOrdinal = 0;
long totalChunkValues = chunk.getValueCount();
while (readValues < totalChunkValues) {
PageHeader pageHeader = reader.readPageHeader();
int compressedPageSize = pageHeader.getCompressed_page_size();
byte[] pageLoad;
switch (pageHeader.type) {
case DICTIONARY_PAGE:
if (dictionaryPage != null) {
throw new IOException("has more than one dictionary page in column chunk: " + chunk);
}
// No quickUpdatePageAAD needed for dictionary page
DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
pageLoad = processPageLoad(
reader,
true,
compressor,
decompressor,
pageHeader.getCompressed_page_size(),
pageHeader.getUncompressed_page_size(),
encryptColumn,
dataEncryptor,
dictPageAAD);
dictionaryPage = new DictionaryPage(
BytesInput.from(pageLoad),
pageHeader.getUncompressed_page_size(),
dictPageHeader.getNum_values(),
converter.getEncoding(dictPageHeader.getEncoding()));
writer.writeDictionaryPage(dictionaryPage, metaEncryptor, dictPageHeaderAAD);
break;
case DATA_PAGE:
if (encryptColumn) {
AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal);
}
DataPageHeader headerV1 = pageHeader.data_page_header;
pageLoad = processPageLoad(
reader,
true,
compressor,
decompressor,
pageHeader.getCompressed_page_size(),
pageHeader.getUncompressed_page_size(),
encryptColumn,
dataEncryptor,
dataPageAAD);
statistics = convertStatistics(
originalCreatedBy,
normalizeNameInType(chunk.getPrimitiveType()),
headerV1.getStatistics(),
columnIndex,
pageOrdinal,
converter);
if (statistics == null) {
// Reach here means both the columnIndex and the page header statistics are null
isColumnStatisticsMalformed = true;
} else {
Preconditions.checkState(
!isColumnStatisticsMalformed,
"Detected mixed null page statistics and non-null page statistics");
}
readValues += headerV1.getNum_values();
if (offsetIndex != null) {
long rowCount = 1
+ offsetIndex.getLastRowIndex(pageOrdinal, blockRowCount)
- offsetIndex.getFirstRowIndex(pageOrdinal);
readRows += rowCount;
writer.writeDataPage(
toIntWithCheck(headerV1.getNum_values()),
pageHeader.getUncompressed_page_size(),
BytesInput.from(pageLoad),
statistics,
toIntWithCheck(rowCount),
converter.getEncoding(headerV1.getRepetition_level_encoding()),
converter.getEncoding(headerV1.getDefinition_level_encoding()),
converter.getEncoding(headerV1.getEncoding()),
metaEncryptor,
dataPageHeaderAAD);
} else {
writer.writeDataPage(
toIntWithCheck(headerV1.getNum_values()),
pageHeader.getUncompressed_page_size(),
BytesInput.from(pageLoad),
statistics,
converter.getEncoding(headerV1.getRepetition_level_encoding()),
converter.getEncoding(headerV1.getDefinition_level_encoding()),
converter.getEncoding(headerV1.getEncoding()),
metaEncryptor,
dataPageHeaderAAD);
}
pageOrdinal++;
break;
case DATA_PAGE_V2:
if (encryptColumn) {
AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal);
}
DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
int rlLength = headerV2.getRepetition_levels_byte_length();
BytesInput rlLevels = readBlockAllocate(rlLength, reader);
int dlLength = headerV2.getDefinition_levels_byte_length();
BytesInput dlLevels = readBlockAllocate(dlLength, reader);
int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
pageLoad = processPageLoad(
reader,
headerV2.is_compressed,
compressor,
decompressor,
payLoadLength,
rawDataLength,
encryptColumn,
dataEncryptor,
dataPageAAD);
statistics = convertStatistics(
originalCreatedBy,
normalizeNameInType(chunk.getPrimitiveType()),
headerV2.getStatistics(),
columnIndex,
pageOrdinal,
converter);
if (statistics == null) {
// Reach here means both the columnIndex and the page header statistics are null
isColumnStatisticsMalformed = true;
} else {
Preconditions.checkState(
!isColumnStatisticsMalformed,
"Detected mixed null page statistics and non-null page statistics");
}
readValues += headerV2.getNum_values();
readRows += headerV2.getNum_rows();
writer.writeDataPageV2(
headerV2.getNum_rows(),
headerV2.getNum_nulls(),
headerV2.getNum_values(),
rlLevels,
dlLevels,
converter.getEncoding(headerV2.getEncoding()),
BytesInput.from(pageLoad),
rawDataLength,
statistics,
metaEncryptor,
dataPageHeaderAAD);
pageOrdinal++;
break;
default:
LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
break;
}
}
Preconditions.checkState(
readRows == 0 || readRows == blockRowCount,
"Read row count: %s not match with block total row count: %s",
readRows,
blockRowCount);
if (isColumnStatisticsMalformed) {
// All the column statistics are invalid, so we need to overwrite the column statistics
writer.invalidateStatistics(chunk.getStatistics());
}
}
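// Rebuilds page statistics from the column index when available, falling back to the page header
// statistics; returns null when neither source is present.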
private Statistics<?> convertStatistics(
String createdBy,
PrimitiveType type,
org.apache.parquet.format.Statistics pageStatistics,
ColumnIndex columnIndex,
int pageIndex,
ParquetMetadataConverter converter)
throws IOException {
if (columnIndex != null) {
if (columnIndex.getNullPages() == null) {
throw new IOException(
"columnIndex has null variable 'nullPages' which indicates corrupted data for type: "
+ type.getName());
}
if (pageIndex > columnIndex.getNullPages().size()) {
throw new IOException(
"There are more pages " + pageIndex + " found in the column than in the columnIndex "
+ columnIndex.getNullPages().size());
}
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
statsBuilder.withNumNulls(columnIndex.getNullCounts().get(pageIndex));
if (!columnIndex.getNullPages().get(pageIndex)) {
statsBuilder.withMin(
columnIndex.getMinValues().get(pageIndex).array().clone());
statsBuilder.withMax(
columnIndex.getMaxValues().get(pageIndex).array().clone());
}
return statsBuilder.build();
} else if (pageStatistics != null) {
return converter.fromParquetStatistics(createdBy, pageStatistics, type);
} else {
return null;
}
}
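// Reads a page payload and, when required, re-compresses it with the new codec and/or encrypts it
// with the given AAD.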
private byte[] processPageLoad(
TransParquetFileReader reader,
boolean isCompressed,
CompressionCodecFactory.BytesInputCompressor compressor,
CompressionCodecFactory.BytesInputDecompressor decompressor,
int payloadLength,
int rawDataLength,
boolean encrypt,
BlockCipher.Encryptor dataEncryptor,
byte[] AAD)
throws IOException {
BytesInput data = readBlock(payloadLength, reader);
// recompress page load
if (compressor != null) {
if (isCompressed) {
data = decompressor.decompress(data, rawDataLength);
}
data = compressor.compress(data);
}
if (!encrypt) {
return data.toByteArray();
}
// encrypt page load
return dataEncryptor.encrypt(data.toByteArray(), AAD);
}
public BytesInput readBlock(int length, TransParquetFileReader reader) throws IOException {
byte[] data;
if (length > pageBufferSize) {
data = new byte[length];
} else {
data = pageBuffer;
}
reader.blockRead(data, 0, length);
return BytesInput.from(data, 0, length);
}
public BytesInput readBlockAllocate(int length, TransParquetFileReader reader) throws IOException {
byte[] data = new byte[length];
reader.blockRead(data, 0, length);
return BytesInput.from(data, 0, length);
}
private int toIntWithCheck(long size) {
if ((int) size != size) {
throw new ParquetEncodingException("size is bigger than " + Integer.MAX_VALUE + " bytes: " + size);
}
return (int) size;
}
// We have to rewrite getPaths because MessageType only returns the level-0 paths
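// e.g. for a schema with group "a" containing primitives "b" and "c", the collected paths are
// "a", "a.b" and "a.c".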
private void getPaths(GroupType schema, List<String> paths, String parent) {
List<Type> fields = schema.getFields();
String prefix = (parent == null) ? "" : parent + ".";
for (Type field : fields) {
paths.add(prefix + field.getName());
if (field instanceof GroupType) {
getPaths(field.asGroupType(), paths, prefix + field.getName());
}
}
}
private MessageType pruneColumnsInSchema(MessageType schema, List<String> pruneColumns) {
if (pruneColumns == null || pruneColumns.isEmpty()) {
return schema;
} else {
List<String> paths = new ArrayList<>();
getPaths(schema, paths, null);
for (String col : pruneColumns) {
if (!paths.contains(col)) {
LOG.warn("Input column name {} doesn't show up in the schema", col);
}
}
Set<ColumnPath> prunePaths = convertToColumnPaths(pruneColumns);
List<Type> fields = schema.getFields();
List<String> currentPath = new ArrayList<>();
List<Type> prunedFields = pruneColumnsInFields(fields, currentPath, prunePaths);
return new MessageType(schema.getName(), prunedFields);
}
}
private List<Type> pruneColumnsInFields(List<Type> fields, List<String> currentPath, Set<ColumnPath> prunePaths) {
List<Type> prunedFields = new ArrayList<>();
for (Type childField : fields) {
Type prunedChildField = pruneColumnsInField(childField, currentPath, prunePaths);
if (prunedChildField != null) {
prunedFields.add(prunedChildField);
}
}
return prunedFields;
}
private Type pruneColumnsInField(Type field, List<String> currentPath, Set<ColumnPath> prunePaths) {
String fieldName = field.getName();
currentPath.add(fieldName);
ColumnPath path = ColumnPath.get(currentPath.toArray(new String[0]));
Type prunedField = null;
if (!prunePaths.contains(path)) {
if (field.isPrimitive()) {
prunedField = field;
} else {
List<Type> childFields = ((GroupType) field).getFields();
List<Type> prunedFields = pruneColumnsInFields(childFields, currentPath, prunePaths);
if (!prunedFields.isEmpty()) {
prunedField = ((GroupType) field).withNewFields(prunedFields);
}
}
}
currentPath.remove(currentPath.size() - 1);
return prunedField;
}
private Set<ColumnPath> convertToColumnPaths(List<String> cols) {
Set<ColumnPath> prunePaths = new HashSet<>();
for (String col : cols) {
prunePaths.add(ColumnPath.fromDotString(col));
}
return prunePaths;
}
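// Nullifies a column chunk: the chunk is decoded with a column reader and re-written through a fresh
// single-column write store, emitting nulls in place of the original values (for repeated columns a
// single null is written per top-level record), then the pages are flushed to the output writer.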
private void nullifyColumn(
TransParquetFileReader reader,
int blockIndex,
ColumnDescriptor descriptor,
ColumnChunkMetaData chunk,
ParquetFileWriter writer,
CompressionCodecName newCodecName,
boolean encryptColumn,
String originalCreatedBy)
throws IOException {
if (encryptColumn) {
Preconditions.checkArgument(writer.getEncryptor() != null, "Missing encryptor");
}
long totalChunkValues = chunk.getValueCount();
int dMax = descriptor.getMaxDefinitionLevel();
PageReadStore pageReadStore = reader.readRowGroup(blockIndex);
ColumnReadStoreImpl crStore =
new ColumnReadStoreImpl(pageReadStore, new DummyGroupConverter(), outSchema, originalCreatedBy);
ColumnReader cReader = crStore.getColumnReader(descriptor);
ParquetProperties.WriterVersion writerVersion = chunk.getEncodingStats().usesV2Pages()
? ParquetProperties.WriterVersion.PARQUET_2_0
: ParquetProperties.WriterVersion.PARQUET_1_0;
ParquetProperties props =
ParquetProperties.builder().withWriterVersion(writerVersion).build();
CodecFactory codecFactory = new CodecFactory(new Configuration(), props.getPageSizeThreshold());
CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(newCodecName);
// Create new schema that only has the current column
MessageType newSchema = getSchemaWithRenamedColumns(newSchema(outSchema, descriptor));
ColumnChunkPageWriteStore cPageStore = new ColumnChunkPageWriteStore(
compressor,
newSchema,
props.getAllocator(),
props.getColumnIndexTruncateLength(),
props.getPageWriteChecksumEnabled(),
nullColumnEncryptor,
numBlocksRewritten);
ColumnWriteStore cStore = props.newColumnWriteStore(newSchema, cPageStore);
ColumnWriter cWriter = cStore.getColumnWriter(descriptor);
for (int i = 0; i < totalChunkValues; i++) {
int rlvl = cReader.getCurrentRepetitionLevel();
int dlvl = cReader.getCurrentDefinitionLevel();
if (dlvl == dMax) {
// since we checked either optional or repeated, dlvl should be > 0
if (dlvl == 0) {
throw new IOException("definition level is detected to be 0 for column "
+ chunk.getPath().toDotString() + " to be nullified");
}
// we just write one null for the whole list at the top level,
// instead of nullify the elements in the list one by one
if (rlvl == 0) {
cWriter.writeNull(rlvl, dlvl - 1);
}
} else {
cWriter.writeNull(rlvl, dlvl);
}
cStore.endRecord();
}
pageReadStore.close();
cStore.flush();
cPageStore.flushToFileWriter(writer);
cStore.close();
cWriter.close();
}
private MessageType newSchema(MessageType schema, ColumnDescriptor descriptor) {
String[] path = descriptor.getPath();
Type type = schema.getType(path);
if (path.length == 1) {
return new MessageType(schema.getName(), type);
}
for (Type field : schema.getFields()) {
if (!field.isPrimitive()) {
Type newType = extractField(field.asGroupType(), type);
if (newType != null) {
return new MessageType(schema.getName(), newType);
}
}
}
// We should never hit this because 'type' is returned by schema.getType().
throw new RuntimeException("No field is found");
}
private Type extractField(GroupType candidate, Type targetField) {
if (targetField.equals(candidate)) {
return targetField;
}
// In case the target field is a descendant of the candidate
for (Type field : candidate.asGroupType().getFields()) {
if (field.isPrimitive()) {
if (field.equals(targetField)) {
return new GroupType(candidate.getRepetition(), candidate.getName(), targetField);
}
} else {
Type tempField = extractField(field.asGroupType(), targetField);
if (tempField != null) {
return new GroupType(candidate.getRepetition(), candidate.getName(), tempField);
}
}
}
return null;
}
private static final class DummyGroupConverter extends GroupConverter {
@Override
public void start() {}
@Override
public void end() {}
@Override
public Converter getConverter(int fieldIndex) {
return new DummyConverter();
}
}
private static final class DummyConverter extends PrimitiveConverter {
@Override
public GroupConverter asGroupConverter() {
return new DummyGroupConverter();
}
}
private static class ColumnChunkEncryptorRunTime {
private final InternalColumnEncryptionSetup colEncrSetup;
private final BlockCipher.Encryptor dataEncryptor;
private final BlockCipher.Encryptor metaDataEncryptor;
private final byte[] fileAAD;
private final byte[] dataPageHeaderAAD;
private final byte[] dataPageAAD;
private final byte[] dictPageHeaderAAD;
private final byte[] dictPageAAD;
public ColumnChunkEncryptorRunTime(
InternalFileEncryptor fileEncryptor, ColumnChunkMetaData chunk, int blockId, int columnId)
throws IOException {
Preconditions.checkArgument(
fileEncryptor != null, "FileEncryptor is required to create ColumnChunkEncryptorRunTime");
this.colEncrSetup = fileEncryptor.getColumnSetup(chunk.getPath(), true, columnId);
this.dataEncryptor = colEncrSetup.getDataEncryptor();
this.metaDataEncryptor = colEncrSetup.getMetaDataEncryptor();
this.fileAAD = fileEncryptor.getFileAAD();
if (colEncrSetup != null && colEncrSetup.isEncrypted()) {
this.dataPageHeaderAAD = createAAD(ModuleType.DataPageHeader, blockId, columnId);
this.dataPageAAD = createAAD(ModuleType.DataPage, blockId, columnId);
this.dictPageHeaderAAD = createAAD(ModuleType.DictionaryPageHeader, blockId, columnId);
this.dictPageAAD = createAAD(ModuleType.DictionaryPage, blockId, columnId);
} else {
this.dataPageHeaderAAD = null;
this.dataPageAAD = null;
this.dictPageHeaderAAD = null;
this.dictPageAAD = null;
}
}
private byte[] createAAD(ModuleType moduleType, int blockId, int columnId) {
return AesCipher.createModuleAAD(fileAAD, moduleType, blockId, columnId, 0);
}
public BlockCipher.Encryptor getDataEncryptor() {
return this.dataEncryptor;
}
public BlockCipher.Encryptor getMetaDataEncryptor() {
return this.metaDataEncryptor;
}
public byte[] getDataPageHeaderAAD() {
return this.dataPageHeaderAAD;
}
public byte[] getDataPageAAD() {
return this.dataPageAAD;
}
public byte[] getDictPageHeaderAAD() {
return this.dictPageHeaderAAD;
}
public byte[] getDictPageAAD() {
return this.dictPageAAD;
}
}
}