Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.parquet.hadoop.ParquetFileWriter Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop;
import static org.apache.parquet.format.Util.writeFileCryptoMetaData;
import static org.apache.parquet.format.Util.writeFileMetaData;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.MAX_STATS_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.CRC32;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Preconditions;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.ColumnEncryptionProperties;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.crypto.ModuleCipherFactory;
import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
import org.apache.parquet.crypto.ParquetCryptoRuntimeException;
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.StrictKeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;
import org.apache.parquet.hadoop.metadata.KeyValueMetadataMergeStrategy;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.PositionOutputStream;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.TypeUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Internal implementation of the Parquet file writer as a block container
*/
public class ParquetFileWriter {
private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class);
private final ParquetMetadataConverter metadataConverter;
public static final String PARQUET_METADATA_FILE = "_metadata";
public static final String MAGIC_STR = "PAR1";
public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
public static final String EF_MAGIC_STR = "PARE";
public static final byte[] EFMAGIC = EF_MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
public static final int CURRENT_VERSION = 1;
// File creation modes
public static enum Mode {
CREATE,
OVERWRITE
}
protected final PositionOutputStream out;
private final MessageType schema;
private final AlignmentStrategy alignment;
private final int columnIndexTruncateLength;
// file data
private List blocks = new ArrayList();
// The column/offset indexes per blocks per column chunks
private final List> columnIndexes = new ArrayList<>();
private final List> offsetIndexes = new ArrayList<>();
// The Bloom filters
private final List> bloomFilters = new ArrayList<>();
// The file encryptor
private final InternalFileEncryptor fileEncryptor;
// row group data
private BlockMetaData currentBlock; // appended to by endColumn
// The column/offset indexes for the actual block
private List currentColumnIndexes;
private List currentOffsetIndexes;
// The Bloom filter for the actual block
private Map currentBloomFilters;
// row group data set at the start of a row group
private long currentRecordCount; // set in startBlock
// column chunk data accumulated as pages are written
private EncodingStats.Builder encodingStatsBuilder;
private Set currentEncodings;
private long uncompressedLength;
private long compressedLength;
private Statistics currentStatistics; // accumulated in writePage(s)
private ColumnIndexBuilder columnIndexBuilder;
private OffsetIndexBuilder offsetIndexBuilder;
// column chunk data set at the start of a column
private CompressionCodecName currentChunkCodec; // set in startColumn
private ColumnPath currentChunkPath; // set in startColumn
private PrimitiveType currentChunkType; // set in startColumn
private long currentChunkValueCount; // set in startColumn
private long currentChunkFirstDataPage; // set in startColumn & page writes
private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage
// set when end is called
private ParquetMetadata footer = null;
private final CRC32 crc;
private boolean pageWriteChecksumEnabled;
/**
* Captures the order in which methods should be called
*/
private enum STATE {
NOT_STARTED {
STATE start() {
return STARTED;
}
},
STARTED {
STATE startBlock() {
return BLOCK;
}
STATE end() {
return ENDED;
}
},
BLOCK {
STATE startColumn() {
return COLUMN;
}
STATE endBlock() {
return STARTED;
}
},
COLUMN {
STATE endColumn() {
return BLOCK;
};
STATE write() {
return this;
}
},
ENDED;
STATE start() throws IOException { return error(); }
STATE startBlock() throws IOException { return error(); }
STATE startColumn() throws IOException { return error(); }
STATE write() throws IOException { return error(); }
STATE endColumn() throws IOException { return error(); }
STATE endBlock() throws IOException { return error(); }
STATE end() throws IOException { return error(); }
private final STATE error() throws IOException {
throw new IOException("The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: " + this.name());
}
}
private STATE state = STATE.NOT_STARTED;
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file) throws IOException {
this(HadoopOutputFile.fromPath(file, configuration),
schema, Mode.CREATE, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode) throws IOException {
this(HadoopOutputFile.fromPath(file, configuration),
schema, mode, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @param rowGroupSize the row group size
* @param maxPaddingSize the maximum padding
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode, long rowGroupSize,
int maxPaddingSize)
throws IOException {
this(HadoopOutputFile.fromPath(file, configuration),
schema, mode, rowGroupSize, maxPaddingSize);
}
/**
* @param file OutputFile to create or overwrite
* @param schema the schema of the data
* @param mode file creation mode
* @param rowGroupSize the row group size
* @param maxPaddingSize the maximum padding
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
long rowGroupSize, int maxPaddingSize)
throws IOException {
this(file, schema, mode, rowGroupSize, maxPaddingSize,
ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
}
/**
* @param file OutputFile to create or overwrite
* @param schema the schema of the data
* @param mode file creation mode
* @param rowGroupSize the row group size
* @param maxPaddingSize the maximum padding
* @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
* @param statisticsTruncateLength the length which the min/max values in row groups tried to be truncated to
* @param pageWriteChecksumEnabled whether to write out page level checksums
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
long rowGroupSize, int maxPaddingSize, int columnIndexTruncateLength,
int statisticsTruncateLength, boolean pageWriteChecksumEnabled)
throws IOException{
this(file, schema, mode, rowGroupSize, maxPaddingSize, columnIndexTruncateLength,
statisticsTruncateLength, pageWriteChecksumEnabled, null);
}
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
long rowGroupSize, int maxPaddingSize, int columnIndexTruncateLength,
int statisticsTruncateLength, boolean pageWriteChecksumEnabled,
FileEncryptionProperties encryptionProperties)
throws IOException {
TypeUtil.checkValidWriteSchema(schema);
this.schema = schema;
long blockSize = rowGroupSize;
if (file.supportsBlockSize()) {
blockSize = Math.max(file.defaultBlockSize(), rowGroupSize);
this.alignment = PaddingAlignment.get(blockSize, rowGroupSize, maxPaddingSize);
} else {
this.alignment = NoAlignment.get(rowGroupSize);
}
if (mode == Mode.OVERWRITE) {
this.out = file.createOrOverwrite(blockSize);
} else {
this.out = file.create(blockSize);
}
this.encodingStatsBuilder = new EncodingStats.Builder();
this.columnIndexTruncateLength = columnIndexTruncateLength;
this.pageWriteChecksumEnabled = pageWriteChecksumEnabled;
this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength);
if (null == encryptionProperties) {
this.fileEncryptor = null;
} else {
// Verify that every encrypted column is in file schema
Map columnEncryptionProperties = encryptionProperties.getEncryptedColumns();
if (null != columnEncryptionProperties) { // if null, every column in file schema will be encrypted with footer key
for (Map.Entry entry : columnEncryptionProperties.entrySet()) {
String[] path = entry.getKey().toArray();
if(!schema.containsPath(path)) {
throw new ParquetCryptoRuntimeException("Encrypted column " + Arrays.toString(path) + " not in file schema");
}
}
}
this.fileEncryptor = new InternalFileEncryptor(encryptionProperties);
}
}
/**
* FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
*
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param rowAndBlockSize the row group size
* @param maxPaddingSize the maximum padding
* @throws IOException if the file can not be created
*/
ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, long rowAndBlockSize, int maxPaddingSize)
throws IOException {
FileSystem fs = file.getFileSystem(configuration);
this.schema = schema;
this.alignment = PaddingAlignment.get(
rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
this.out = HadoopStreams.wrap(
fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
this.encodingStatsBuilder = new EncodingStats.Builder();
// no truncation is needed for testing
this.columnIndexTruncateLength = Integer.MAX_VALUE;
this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
this.fileEncryptor = null;
}
/**
* start the file
* @throws IOException if there is an error while writing
*/
public void start() throws IOException {
state = state.start();
LOG.debug("{}: start", out.getPos());
byte[] magic = MAGIC;
if (null != fileEncryptor && fileEncryptor.isFooterEncrypted()) {
magic = EFMAGIC;
}
out.write(magic);
}
InternalFileEncryptor getEncryptor() {
return fileEncryptor;
}
/**
* start a block
* @param recordCount the record count in this block
* @throws IOException if there is an error while writing
*/
public void startBlock(long recordCount) throws IOException {
state = state.startBlock();
LOG.debug("{}: start block", out.getPos());
// out.write(MAGIC); // TODO: add a magic delimiter
alignment.alignForRowGroup(out);
currentBlock = new BlockMetaData();
currentRecordCount = recordCount;
currentColumnIndexes = new ArrayList<>();
currentOffsetIndexes = new ArrayList<>();
currentBloomFilters = new HashMap<>();
}
/**
* start a column inside a block
* @param descriptor the column descriptor
* @param valueCount the value count in this column
* @param compressionCodecName a compression codec name
* @throws IOException if there is an error while writing
*/
public void startColumn(ColumnDescriptor descriptor,
long valueCount,
CompressionCodecName compressionCodecName) throws IOException {
state = state.startColumn();
encodingStatsBuilder.clear();
currentEncodings = new HashSet();
currentChunkPath = ColumnPath.get(descriptor.getPath());
currentChunkType = descriptor.getPrimitiveType();
currentChunkCodec = compressionCodecName;
currentChunkValueCount = valueCount;
currentChunkFirstDataPage = -1;
compressedLength = 0;
uncompressedLength = 0;
// The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one
currentStatistics = null;
columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength);
offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
}
/**
* writes a dictionary page page
* @param dictionaryPage the dictionary page
* @throws IOException if there is an error while writing
*/
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
writeDictionaryPage(dictionaryPage, null, null);
}
public void writeDictionaryPage(DictionaryPage dictionaryPage,
BlockCipher.Encryptor headerBlockEncryptor, byte[] AAD) throws IOException {
state = state.write();
LOG.debug("{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize());
currentChunkDictionaryPageOffset = out.getPos();
int uncompressedSize = dictionaryPage.getUncompressedSize();
int compressedPageSize = (int)dictionaryPage.getBytes().size(); // TODO: fix casts
if (pageWriteChecksumEnabled) {
crc.reset();
crc.update(dictionaryPage.getBytes().toByteArray());
metadataConverter.writeDictionaryPageHeader(
uncompressedSize,
compressedPageSize,
dictionaryPage.getDictionarySize(),
dictionaryPage.getEncoding(),
(int) crc.getValue(),
out,
headerBlockEncryptor,
AAD);
} else {
metadataConverter.writeDictionaryPageHeader(
uncompressedSize,
compressedPageSize,
dictionaryPage.getDictionarySize(),
dictionaryPage.getEncoding(),
out,
headerBlockEncryptor,
AAD);
}
long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
this.uncompressedLength += uncompressedSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write dictionary page content {}", out.getPos(), compressedPageSize);
dictionaryPage.getBytes().writeAllTo(out); // for encrypted column, dictionary page bytes are already encrypted
encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding());
currentEncodings.add(dictionaryPage.getEncoding());
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
* @throws IOException if there is an error while writing
*/
@Deprecated
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
state = state.write();
// We are unable to build indexes without rowCount so skip them for this column
offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
bytes.writeAllTo(out);
encodingStatsBuilder.addDataEncoding(valuesEncoding);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
if (currentChunkFirstDataPage < 0) {
currentChunkFirstDataPage = beforeHeader;
}
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param statistics statistics for the page
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
* @throws IOException if there is an error while writing
* @deprecated this method does not support writing column indexes; Use
* {@link #writeDataPage(int, int, BytesInput, Statistics, long, Encoding, Encoding, Encoding)} instead
*/
@Deprecated
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Statistics statistics,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
// We are unable to build indexes without rowCount so skip them for this column
offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding);
}
/**
* Writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param statistics the statistics of the page
* @param rowCount the number of rows in the page
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
* @throws IOException if any I/O error occurs during writing the file
*/
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Statistics statistics,
long rowCount,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
long beforeHeader = out.getPos();
innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding);
offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount);
}
private void innerWriteDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Statistics statistics,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
if (currentChunkFirstDataPage < 0) {
currentChunkFirstDataPage = beforeHeader;
}
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int) bytes.size();
if (pageWriteChecksumEnabled) {
crc.reset();
crc.update(bytes.toByteArray());
metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
(int) crc.getValue(),
out);
} else {
metadataConverter.writeDataPageV1Header(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
}
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
bytes.writeAllTo(out);
// Copying the statistics if it is not initialized yet so we have the correct typed one
if (currentStatistics == null) {
currentStatistics = statistics.copy();
} else {
currentStatistics.mergeStatistics(statistics);
}
columnIndexBuilder.add(statistics);
encodingStatsBuilder.addDataEncoding(valuesEncoding);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* Add a Bloom filter that will be written out. This is only used in unit test.
*
* @param column the column name
* @param bloomFilter the bloom filter of column values
*/
void addBloomFilter(String column, BloomFilter bloomFilter) {
currentBloomFilters.put(column , bloomFilter);
}
/**
* Writes a single v2 data page
* @param rowCount count of rows
* @param nullCount count of nulls
* @param valueCount count of values
* @param repetitionLevels repetition level bytes
* @param definitionLevels definition level bytes
* @param dataEncoding encoding for data
* @param compressedData compressed data bytes
* @param uncompressedDataSize the size of uncompressed data
* @param statistics the statistics of the page
* @throws IOException if any I/O error occurs during writing the file
*/
public void writeDataPageV2(int rowCount, int nullCount, int valueCount,
BytesInput repetitionLevels,
BytesInput definitionLevels,
Encoding dataEncoding,
BytesInput compressedData,
int uncompressedDataSize,
Statistics> statistics) throws IOException {
state = state.write();
int rlByteLength = toIntWithCheck(repetitionLevels.size());
int dlByteLength = toIntWithCheck(definitionLevels.size());
int compressedSize = toIntWithCheck(
compressedData.size() + repetitionLevels.size() + definitionLevels.size()
);
int uncompressedSize = toIntWithCheck(
uncompressedDataSize + repetitionLevels.size() + definitionLevels.size()
);
long beforeHeader = out.getPos();
if (currentChunkFirstDataPage < 0) {
currentChunkFirstDataPage = beforeHeader;
}
metadataConverter.writeDataPageV2Header(
uncompressedSize, compressedSize,
valueCount, nullCount, rowCount,
dataEncoding,
rlByteLength,
dlByteLength,
out);
long headersSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedSize + headersSize;
this.compressedLength += compressedSize + headersSize;
if (currentStatistics == null) {
currentStatistics = statistics.copy();
} else {
currentStatistics.mergeStatistics(statistics);
}
columnIndexBuilder.add(statistics);
currentEncodings.add(dataEncoding);
encodingStatsBuilder.addDataEncoding(dataEncoding);
BytesInput.concat(repetitionLevels, definitionLevels, compressedData)
.writeAllTo(out);
offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount);
}
/**
* Writes a column chunk at once
* @param descriptor the descriptor of the column
* @param valueCount the value count in this column
* @param compressionCodecName the name of the compression codec used for compressing the pages
* @param dictionaryPage the dictionary page for this column chunk (might be null)
* @param bytes the encoded pages including page headers to be written as is
* @param uncompressedTotalPageSize total uncompressed size (without page headers)
* @param compressedTotalPageSize total compressed size (without page headers)
* @param totalStats accumulated statistics for the column chunk
* @param columnIndexBuilder the builder object for the column index
* @param offsetIndexBuilder the builder object for the offset index
* @param bloomFilter the bloom filter for this column
* @param rlEncodings the RL encodings used in this column chunk
* @param dlEncodings the DL encodings used in this column chunk
* @param dataEncodings the data encodings used in this column chunk
* @throws IOException if there is an error while writing
*/
void writeColumnChunk(ColumnDescriptor descriptor,
long valueCount,
CompressionCodecName compressionCodecName,
DictionaryPage dictionaryPage,
BytesInput bytes,
long uncompressedTotalPageSize,
long compressedTotalPageSize,
Statistics> totalStats,
ColumnIndexBuilder columnIndexBuilder,
OffsetIndexBuilder offsetIndexBuilder,
BloomFilter bloomFilter,
Set rlEncodings,
Set dlEncodings,
List dataEncodings) throws IOException {
writeColumnChunk(descriptor, valueCount, compressionCodecName, dictionaryPage, bytes,
uncompressedTotalPageSize, compressedTotalPageSize, totalStats, columnIndexBuilder, offsetIndexBuilder,
bloomFilter, rlEncodings, dlEncodings, dataEncodings, null, 0, 0, null);
}
void writeColumnChunk(ColumnDescriptor descriptor,
long valueCount,
CompressionCodecName compressionCodecName,
DictionaryPage dictionaryPage,
BytesInput bytes,
long uncompressedTotalPageSize,
long compressedTotalPageSize,
Statistics> totalStats,
ColumnIndexBuilder columnIndexBuilder,
OffsetIndexBuilder offsetIndexBuilder,
BloomFilter bloomFilter,
Set rlEncodings,
Set dlEncodings,
List dataEncodings,
BlockCipher.Encryptor headerBlockEncryptor,
int rowGroupOrdinal,
int columnOrdinal,
byte[] fileAAD) throws IOException {
startColumn(descriptor, valueCount, compressionCodecName);
state = state.write();
if (dictionaryPage != null) {
byte[] dictonaryPageHeaderAAD = null;
if (null != headerBlockEncryptor) {
dictonaryPageHeaderAAD = AesCipher.createModuleAAD(fileAAD, ModuleType.DictionaryPageHeader,
rowGroupOrdinal, columnOrdinal, -1);
}
writeDictionaryPage(dictionaryPage, headerBlockEncryptor, dictonaryPageHeaderAAD);
}
if (bloomFilter != null) {
// write bloom filter if one of data pages is not dictionary encoded
boolean isWriteBloomFilter = false;
for (Encoding encoding : dataEncodings) {
if (encoding != Encoding.RLE_DICTIONARY) {
isWriteBloomFilter = true;
break;
}
}
if (isWriteBloomFilter) {
currentBloomFilters.put(String.join(".", descriptor.getPath()), bloomFilter);
}
}
LOG.debug("{}: write data pages", out.getPos());
long headersSize = bytes.size() - compressedTotalPageSize;
this.uncompressedLength += uncompressedTotalPageSize + headersSize;
this.compressedLength += compressedTotalPageSize + headersSize;
LOG.debug("{}: write data pages content", out.getPos());
currentChunkFirstDataPage = out.getPos();
bytes.writeAllTo(out);
encodingStatsBuilder.addDataEncodings(dataEncodings);
if (rlEncodings.isEmpty()) {
encodingStatsBuilder.withV2Pages();
}
currentEncodings.addAll(rlEncodings);
currentEncodings.addAll(dlEncodings);
currentEncodings.addAll(dataEncodings);
currentStatistics = totalStats;
this.columnIndexBuilder = columnIndexBuilder;
this.offsetIndexBuilder = offsetIndexBuilder;
endColumn();
}
/**
* end a column (once all rep, def and data have been written)
* @throws IOException if there is an error while writing
*/
public void endColumn() throws IOException {
state = state.endColumn();
LOG.debug("{}: end column", out.getPos());
if (columnIndexBuilder.getMinMaxSize() > columnIndexBuilder.getPageCount() * MAX_STATS_SIZE) {
currentColumnIndexes.add(null);
} else {
currentColumnIndexes.add(columnIndexBuilder.build());
}
currentOffsetIndexes.add(offsetIndexBuilder.build(currentChunkFirstDataPage));
currentBlock.addColumn(ColumnChunkMetaData.get(
currentChunkPath,
currentChunkType,
currentChunkCodec,
encodingStatsBuilder.build(),
currentEncodings,
currentStatistics,
currentChunkFirstDataPage,
currentChunkDictionaryPageOffset,
currentChunkValueCount,
compressedLength,
uncompressedLength));
this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
this.uncompressedLength = 0;
this.compressedLength = 0;
this.currentChunkDictionaryPageOffset = 0;
columnIndexBuilder = null;
offsetIndexBuilder = null;
}
/**
* ends a block once all column chunks have been written
* @throws IOException if there is an error while writing
*/
public void endBlock() throws IOException {
if (currentRecordCount == 0) {
throw new ParquetEncodingException("End block with zero record");
}
state = state.endBlock();
LOG.debug("{}: end block", out.getPos());
currentBlock.setRowCount(currentRecordCount);
currentBlock.setOrdinal(blocks.size());
blocks.add(currentBlock);
columnIndexes.add(currentColumnIndexes);
offsetIndexes.add(currentOffsetIndexes);
bloomFilters.add(currentBloomFilters);
currentColumnIndexes = null;
currentOffsetIndexes = null;
currentBloomFilters = null;
currentBlock = null;
}
/**
* @param conf a configuration
* @param file a file path to append the contents of to this file
* @throws IOException if there is an error while reading or writing
* @deprecated will be removed in 2.0.0; use {@link #appendFile(InputFile)} instead
*/
@Deprecated
public void appendFile(Configuration conf, Path file) throws IOException {
try (ParquetFileReader reader = ParquetFileReader.open(conf, file)) {
reader.appendTo(this);
}
}
public void appendFile(InputFile file) throws IOException {
try (ParquetFileReader reader = ParquetFileReader.open(file)) {
reader.appendTo(this);
}
}
/**
* @param file a file stream to read from
* @param rowGroups row groups to copy
* @param dropColumns whether to drop columns from the file that are not in this file's schema
* @throws IOException if there is an error while reading or writing
* @deprecated will be removed in 2.0.0;
* use {@link #appendRowGroups(SeekableInputStream,List,boolean)} instead
*/
@Deprecated
public void appendRowGroups(FSDataInputStream file,
List rowGroups,
boolean dropColumns) throws IOException {
appendRowGroups(HadoopStreams.wrap(file), rowGroups, dropColumns);
}
public void appendRowGroups(SeekableInputStream file,
List rowGroups,
boolean dropColumns) throws IOException {
for (BlockMetaData block : rowGroups) {
appendRowGroup(file, block, dropColumns);
}
}
/**
* @param from a file stream to read from
* @param rowGroup row group to copy
* @param dropColumns whether to drop columns from the file that are not in this file's schema
* @throws IOException if there is an error while reading or writing
* @deprecated will be removed in 2.0.0;
* use {@link #appendRowGroup(SeekableInputStream,BlockMetaData,boolean)} instead
*/
@Deprecated
public void appendRowGroup(FSDataInputStream from, BlockMetaData rowGroup,
boolean dropColumns) throws IOException {
appendRowGroup(HadoopStreams.wrap(from), rowGroup, dropColumns);
}
public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup,
boolean dropColumns) throws IOException {
startBlock(rowGroup.getRowCount());
Map columnsToCopy =
new HashMap();
for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
columnsToCopy.put(chunk.getPath().toDotString(), chunk);
}
List columnsInOrder =
new ArrayList();
for (ColumnDescriptor descriptor : schema.getColumns()) {
String path = ColumnPath.get(descriptor.getPath()).toDotString();
ColumnChunkMetaData chunk = columnsToCopy.remove(path);
if (chunk != null) {
columnsInOrder.add(chunk);
} else {
throw new IllegalArgumentException(String.format(
"Missing column '%s', cannot copy row group: %s", path, rowGroup));
}
}
// complain if some columns would be dropped and that's not okay
if (!dropColumns && !columnsToCopy.isEmpty()) {
throw new IllegalArgumentException(String.format(
"Columns cannot be copied (missing from target schema): %s",
String.join(", ", columnsToCopy.keySet())));
}
// copy the data for all chunks
long start = -1;
long length = 0;
long blockUncompressedSize = 0L;
for (int i = 0; i < columnsInOrder.size(); i += 1) {
ColumnChunkMetaData chunk = columnsInOrder.get(i);
// get this chunk's start position in the new file
long newChunkStart = out.getPos() + length;
// add this chunk to be copied with any previous chunks
if (start < 0) {
// no previous chunk included, start at this chunk's starting pos
start = chunk.getStartingPos();
}
length += chunk.getTotalSize();
if ((i + 1) == columnsInOrder.size() ||
columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
// not contiguous. do the copy now.
copy(from, out, start, length);
// reset to start at the next column chunk
start = -1;
length = 0;
}
// TODO: column/offset indexes are not copied
// (it would require seeking to the end of the file for each row groups)
currentColumnIndexes.add(null);
currentOffsetIndexes.add(null);
Offsets offsets = Offsets.getOffsets(from, chunk, newChunkStart);
currentBlock.addColumn(ColumnChunkMetaData.get(
chunk.getPath(),
chunk.getPrimitiveType(),
chunk.getCodec(),
chunk.getEncodingStats(),
chunk.getEncodings(),
chunk.getStatistics(),
offsets.firstDataPageOffset,
offsets.dictionaryPageOffset,
chunk.getValueCount(),
chunk.getTotalSize(),
chunk.getTotalUncompressedSize()));
blockUncompressedSize += chunk.getTotalUncompressedSize();
}
currentBlock.setTotalByteSize(blockUncompressedSize);
endBlock();
}
/**
* @param descriptor the descriptor for the target column
* @param from a file stream to read from
* @param chunk the column chunk to be copied
* @param bloomFilter the bloomFilter for this chunk
* @param columnIndex the column index for this chunk
* @param offsetIndex the offset index for this chunk
* @throws IOException
*/
public void appendColumnChunk(ColumnDescriptor descriptor, SeekableInputStream from, ColumnChunkMetaData chunk,
BloomFilter bloomFilter, ColumnIndex columnIndex, OffsetIndex offsetIndex) throws IOException {
long start = chunk.getStartingPos();
long length = chunk.getTotalSize();
long newChunkStart = out.getPos();
copy(from, out, start, length);
currentBloomFilters.put(String.join(".", descriptor.getPath()), bloomFilter);
currentColumnIndexes.add(columnIndex);
currentOffsetIndexes.add(offsetIndex);
Offsets offsets = Offsets.getOffsets(from, chunk, newChunkStart);
currentBlock.addColumn(ColumnChunkMetaData.get(
chunk.getPath(),
chunk.getPrimitiveType(),
chunk.getCodec(),
chunk.getEncodingStats(),
chunk.getEncodings(),
chunk.getStatistics(),
offsets.firstDataPageOffset,
offsets.dictionaryPageOffset,
chunk.getValueCount(),
chunk.getTotalSize(),
chunk.getTotalUncompressedSize()));
currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + chunk.getTotalUncompressedSize());
}
// Buffers for the copy function.
private static final ThreadLocal COPY_BUFFER = ThreadLocal.withInitial(() -> new byte[8192]);
/**
* Copy from a FS input stream to an output stream. Thread-safe
*
* @param from a {@link SeekableInputStream}
* @param to any {@link PositionOutputStream}
* @param start where in the from stream to start copying
* @param length the number of bytes to copy
* @throws IOException if there is an error while reading or writing
*/
private static void copy(SeekableInputStream from, PositionOutputStream to,
long start, long length) throws IOException{
LOG.debug("Copying {} bytes at {} to {}" ,length , start , to.getPos());
from.seek(start);
long bytesCopied = 0;
byte[] buffer = COPY_BUFFER.get();
while (bytesCopied < length) {
long bytesLeft = length - bytesCopied;
int bytesRead = from.read(buffer, 0,
(buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
if (bytesRead < 0) {
throw new IllegalArgumentException(
"Unexpected end of input file at " + start + bytesCopied);
}
to.write(buffer, 0, bytesRead);
bytesCopied += bytesRead;
}
}
/**
* ends a file once all blocks have been written.
* closes the file.
* @param extraMetaData the extra meta data to write in the footer
* @throws IOException if there is an error while writing
*/
public void end(Map extraMetaData) throws IOException {
state = state.end();
serializeColumnIndexes(columnIndexes, blocks, out, fileEncryptor);
serializeOffsetIndexes(offsetIndexes, blocks, out, fileEncryptor);
serializeBloomFilters(bloomFilters, blocks, out, fileEncryptor);
LOG.debug("{}: end", out.getPos());
this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
serializeFooter(footer, out, fileEncryptor);
out.close();
}
private static void serializeColumnIndexes(
List> columnIndexes,
List blocks,
PositionOutputStream out,
InternalFileEncryptor fileEncryptor) throws IOException {
LOG.debug("{}: column indexes", out.getPos());
for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
BlockMetaData block = blocks.get(bIndex);
List columns = block.getColumns();
List blockColumnIndexes = columnIndexes.get(bIndex);
for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
ColumnChunkMetaData column = columns.get(cIndex);
org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
.toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
if (columnIndex == null) {
continue;
}
BlockCipher.Encryptor columnIndexEncryptor = null;
byte[] columnIndexAAD = null;
if (null != fileEncryptor) {
InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
if (columnEncryptionSetup.isEncrypted()) {
columnIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
columnIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnIndex,
block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
}
}
long offset = out.getPos();
Util.writeColumnIndex(columnIndex, out, columnIndexEncryptor, columnIndexAAD);
column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
}
}
}
private int toIntWithCheck(long size) {
if ((int)size != size) {
throw new ParquetEncodingException("Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " + size);
}
return (int)size;
}
private static void serializeOffsetIndexes(
List> offsetIndexes,
List blocks,
PositionOutputStream out,
InternalFileEncryptor fileEncryptor) throws IOException {
LOG.debug("{}: offset indexes", out.getPos());
for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
BlockMetaData block = blocks.get(bIndex);
List columns = block.getColumns();
List blockOffsetIndexes = offsetIndexes.get(bIndex);
for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
if (offsetIndex == null) {
continue;
}
ColumnChunkMetaData column = columns.get(cIndex);
BlockCipher.Encryptor offsetIndexEncryptor = null;
byte[] offsetIndexAAD = null;
if (null != fileEncryptor) {
InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
if (columnEncryptionSetup.isEncrypted()) {
offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
offsetIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.OffsetIndex,
block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
}
}
long offset = out.getPos();
Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out, offsetIndexEncryptor, offsetIndexAAD);
column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
}
}
}
private static void serializeBloomFilters(
List> bloomFilters,
List blocks,
PositionOutputStream out,
InternalFileEncryptor fileEncryptor) throws IOException {
LOG.debug("{}: bloom filters", out.getPos());
for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
BlockMetaData block = blocks.get(bIndex);
List columns = block.getColumns();
Map blockBloomFilters = bloomFilters.get(bIndex);
if (blockBloomFilters.isEmpty()) continue;
for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
ColumnChunkMetaData column = columns.get(cIndex);
BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
if (bloomFilter == null) {
continue;
}
long offset = out.getPos();
column.setBloomFilterOffset(offset);
BlockCipher.Encryptor bloomFilterEncryptor = null;
byte[] bloomFilterHeaderAAD = null;
byte[] bloomFilterBitsetAAD = null;
if (null != fileEncryptor) {
InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
if (columnEncryptionSetup.isEncrypted()) {
bloomFilterEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
int columnOrdinal = columnEncryptionSetup.getOrdinal();
bloomFilterHeaderAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.BloomFilterHeader,
block.getOrdinal(), columnOrdinal, -1);
bloomFilterBitsetAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.BloomFilterBitset,
block.getOrdinal(), columnOrdinal, -1);
}
}
Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out,
bloomFilterEncryptor, bloomFilterHeaderAAD);
ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
bloomFilter.writeTo(tempOutStream);
byte[] serializedBitset = tempOutStream.toByteArray();
if (null != bloomFilterEncryptor) {
serializedBitset = bloomFilterEncryptor.encrypt(serializedBitset, bloomFilterBitsetAAD);
}
out.write(serializedBitset);
}
}
}
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out,
InternalFileEncryptor fileEncryptor) throws IOException {
ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
// Unencrypted file
if (null == fileEncryptor) {
long footerIndex = out.getPos();
org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
writeFileMetaData(parquetMetadata, out);
LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerIndex));
BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
out.write(MAGIC);
return;
}
org.apache.parquet.format.FileMetaData parquetMetadata =
metadataConverter.toParquetMetadata(CURRENT_VERSION, footer, fileEncryptor);
// Encrypted file with plaintext footer
if (!fileEncryptor.isFooterEncrypted()) {
long footerIndex = out.getPos();
parquetMetadata.setEncryption_algorithm(fileEncryptor.getEncryptionAlgorithm());
// create footer signature (nonce + tag of encrypted footer)
byte[] footerSigningKeyMetaData = fileEncryptor.getFooterSigningKeyMetaData();
if (null != footerSigningKeyMetaData) {
parquetMetadata.setFooter_signing_key_metadata(footerSigningKeyMetaData);
}
ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
writeFileMetaData(parquetMetadata, tempOutStream);
byte[] serializedFooter = tempOutStream.toByteArray();
byte[] footerAAD = AesCipher.createFooterAAD(fileEncryptor.getFileAAD());
byte[] encryptedFooter = fileEncryptor.getSignedFooterEncryptor().encrypt(serializedFooter, footerAAD);
byte[] signature = new byte[AesCipher.NONCE_LENGTH + AesCipher.GCM_TAG_LENGTH];
System.arraycopy(encryptedFooter, ModuleCipherFactory.SIZE_LENGTH, signature, 0, AesCipher.NONCE_LENGTH); // copy Nonce
System.arraycopy(encryptedFooter, encryptedFooter.length - AesCipher.GCM_TAG_LENGTH,
signature, AesCipher.NONCE_LENGTH, AesCipher.GCM_TAG_LENGTH); // copy GCM Tag
out.write(serializedFooter);
out.write(signature);
LOG.debug("{}: footer and signature length = {}" , out.getPos(), (out.getPos() - footerIndex));
BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
out.write(MAGIC);
return;
}
// Encrypted file with encrypted footer
long cryptoFooterIndex = out.getPos();
writeFileCryptoMetaData(fileEncryptor.getFileCryptoMetaData(), out);
byte[] footerAAD = AesCipher.createFooterAAD(fileEncryptor.getFileAAD());
writeFileMetaData(parquetMetadata, out, fileEncryptor.getFooterEncryptor(), footerAAD);
int combinedMetaDataLength = (int)(out.getPos() - cryptoFooterIndex);
LOG.debug("{}: crypto metadata and footer length = {}" , out.getPos(), combinedMetaDataLength);
BytesUtils.writeIntLittleEndian(out, combinedMetaDataLength);
out.write(EFMAGIC);
}
public ParquetMetadata getFooter() {
Preconditions.checkState(state == STATE.ENDED, "Cannot return unfinished footer.");
return footer;
}
/**
* Given a list of metadata files, merge them into a single ParquetMetadata
* Requires that the schemas be compatible, and the extraMetadata be exactly equal.
* @param files a list of files to merge metadata from
* @param conf a configuration
* @return merged parquet metadata for the files
* @throws IOException if there is an error while writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List files, Configuration conf) throws IOException {
return mergeMetadataFiles(files, conf, new StrictKeyValueMetadataMergeStrategy());
}
/**
* Given a list of metadata files, merge them into a single ParquetMetadata
* Requires that the schemas be compatible, and the extraMetadata be exactly equal.
* @param files a list of files to merge metadata from
* @param conf a configuration
* @param keyValueMetadataMergeStrategy strategy to merge values for same key, if there are multiple
* @return merged parquet metadata for the files
* @throws IOException if there is an error while writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List files, Configuration conf,
KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy) throws IOException {
Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
GlobalMetaData globalMetaData = null;
List blocks = new ArrayList();
for (Path p : files) {
ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
FileMetaData fmd = pmd.getFileMetaData();
globalMetaData = mergeInto(fmd, globalMetaData, true);
blocks.addAll(pmd.getBlocks());
}
// collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
return new ParquetMetadata(globalMetaData.merge(keyValueMetadataMergeStrategy), blocks);
}
/**
* Given a list of metadata files, merge them into a single metadata file.
* Requires that the schemas be compatible, and the extraMetaData be exactly equal.
* This is useful when merging 2 directories of parquet files into a single directory, as long
* as both directories were written with compatible schemas and equal extraMetaData.
* @param files a list of files to merge metadata from
* @param outputPath path to write merged metadata to
* @param conf a configuration
* @throws IOException if there is an error while reading or writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static void writeMergedMetadataFile(List files, Path outputPath, Configuration conf) throws IOException {
ParquetMetadata merged = mergeMetadataFiles(files, conf);
writeMetadataFile(outputPath, merged, outputPath.getFileSystem(conf));
}
/**
* writes a _metadata and _common_metadata file
* @param configuration the configuration to use to get the FileSystem
* @param outputPath the directory to write the _metadata file to
* @param footers the list of footers to merge
* @throws IOException if there is an error while writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static void writeMetadataFile(Configuration configuration, Path outputPath, List footers) throws IOException {
writeMetadataFile(configuration, outputPath, footers, JobSummaryLevel.ALL);
}
/**
* writes _common_metadata file, and optionally a _metadata file depending on the {@link JobSummaryLevel} provided
* @param configuration the configuration to use to get the FileSystem
* @param outputPath the directory to write the _metadata file to
* @param footers the list of footers to merge
* @param level level of summary to write
* @throws IOException if there is an error while writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static void writeMetadataFile(Configuration configuration, Path outputPath, List footers, JobSummaryLevel level) throws IOException {
Preconditions.checkArgument(level == JobSummaryLevel.ALL || level == JobSummaryLevel.COMMON_ONLY,
"Unsupported level: " + level);
FileSystem fs = outputPath.getFileSystem(configuration);
outputPath = outputPath.makeQualified(fs);
ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);
if (level == JobSummaryLevel.ALL) {
writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
}
metadataFooter.getBlocks().clear();
writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
/**
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
private static void writeMetadataFile(Path outputPathRoot, ParquetMetadata metadataFooter, FileSystem fs, String parquetMetadataFile)
throws IOException {
Path metaDataPath = new Path(outputPathRoot, parquetMetadataFile);
writeMetadataFile(metaDataPath, metadataFooter, fs);
}
/**
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
throws IOException {
PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath));
metadata.write(MAGIC);
serializeFooter(metadataFooter, metadata, null);
metadata.close();
}
/**
* Will merge the metadata of all the footers together
* @param root the directory containing all footers
* @param footers the list files footers to merge
* @return the global meta data for all the footers
*/
static ParquetMetadata mergeFooters(Path root, List footers) {
return mergeFooters(root, footers, new StrictKeyValueMetadataMergeStrategy());
}
/**
* Will merge the metadata of all the footers together
* @param root the directory containing all footers
* @param footers the list files footers to merge
* @param keyValueMergeStrategy strategy to merge values for a given key (if there are multiple values)
* @return the global meta data for all the footers
*/
static ParquetMetadata mergeFooters(Path root, List footers, KeyValueMetadataMergeStrategy keyValueMergeStrategy) {
String rootPath = root.toUri().getPath();
GlobalMetaData fileMetaData = null;
List blocks = new ArrayList();
for (Footer footer : footers) {
String footerPath = footer.getFile().toUri().getPath();
if (!footerPath.startsWith(rootPath)) {
throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
}
footerPath = footerPath.substring(rootPath.length());
while (footerPath.startsWith("/")) {
footerPath = footerPath.substring(1);
}
fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
block.setPath(footerPath);
blocks.add(block);
}
}
return new ParquetMetadata(fileMetaData.merge(keyValueMergeStrategy), blocks);
}
/**
* @return the current position in the underlying file
* @throws IOException if there is an error while getting the current stream's position
*/
public long getPos() throws IOException {
return out.getPos();
}
public long getNextRowGroupSize() throws IOException {
return alignment.nextRowGroupSize(out);
}
/**
* Will merge the metadata of all the footers together
* @param footers the list files footers to merge
* @return the global meta data for all the footers
*/
static GlobalMetaData getGlobalMetaData(List footers) {
return getGlobalMetaData(footers, true);
}
static GlobalMetaData getGlobalMetaData(List footers, boolean strict) {
GlobalMetaData fileMetaData = null;
for (Footer footer : footers) {
ParquetMetadata currentMetadata = footer.getParquetMetadata();
fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
}
return fileMetaData;
}
/**
* Will return the result of merging toMerge into mergedMetadata
* @param toMerge the metadata toMerge
* @param mergedMetadata the reference metadata to merge into
* @return the result of the merge
*/
static GlobalMetaData mergeInto(
FileMetaData toMerge,
GlobalMetaData mergedMetadata) {
return mergeInto(toMerge, mergedMetadata, true);
}
static GlobalMetaData mergeInto(
FileMetaData toMerge,
GlobalMetaData mergedMetadata,
boolean strict) {
MessageType schema = null;
Map> newKeyValues = new HashMap>();
Set createdBy = new HashSet();
if (mergedMetadata != null) {
schema = mergedMetadata.getSchema();
newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
createdBy.addAll(mergedMetadata.getCreatedBy());
}
if ((schema == null && toMerge.getSchema() != null)
|| (schema != null && !schema.equals(toMerge.getSchema()))) {
schema = mergeInto(toMerge.getSchema(), schema, strict);
}
for (Entry entry : toMerge.getKeyValueMetaData().entrySet()) {
Set values = newKeyValues.get(entry.getKey());
if (values == null) {
values = new LinkedHashSet();
newKeyValues.put(entry.getKey(), values);
}
values.add(entry.getValue());
}
createdBy.add(toMerge.getCreatedBy());
return new GlobalMetaData(
schema,
newKeyValues,
createdBy);
}
/**
* will return the result of merging toMerge into mergedSchema
* @param toMerge the schema to merge into mergedSchema
* @param mergedSchema the schema to append the fields to
* @return the resulting schema
*/
static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema) {
return mergeInto(toMerge, mergedSchema, true);
}
/**
* will return the result of merging toMerge into mergedSchema
* @param toMerge the schema to merge into mergedSchema
* @param mergedSchema the schema to append the fields to
* @param strict should schema primitive types match
* @return the resulting schema
*/
static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema, boolean strict) {
if (mergedSchema == null) {
return toMerge;
}
return mergedSchema.union(toMerge, strict);
}
private interface AlignmentStrategy {
void alignForRowGroup(PositionOutputStream out) throws IOException;
long nextRowGroupSize(PositionOutputStream out) throws IOException;
}
private static class NoAlignment implements AlignmentStrategy {
public static NoAlignment get(long rowGroupSize) {
return new NoAlignment(rowGroupSize);
}
private final long rowGroupSize;
private NoAlignment(long rowGroupSize) {
this.rowGroupSize = rowGroupSize;
}
@Override
public void alignForRowGroup(PositionOutputStream out) {
}
@Override
public long nextRowGroupSize(PositionOutputStream out) {
return rowGroupSize;
}
}
/**
* Alignment strategy that pads when less than half the row group size is
* left before the next DFS block.
*/
private static class PaddingAlignment implements AlignmentStrategy {
private static final byte[] zeros = new byte[4096];
public static PaddingAlignment get(long dfsBlockSize, long rowGroupSize,
int maxPaddingSize) {
return new PaddingAlignment(dfsBlockSize, rowGroupSize, maxPaddingSize);
}
protected final long dfsBlockSize;
protected final long rowGroupSize;
protected final int maxPaddingSize;
private PaddingAlignment(long dfsBlockSize, long rowGroupSize,
int maxPaddingSize) {
this.dfsBlockSize = dfsBlockSize;
this.rowGroupSize = rowGroupSize;
this.maxPaddingSize = maxPaddingSize;
}
@Override
public void alignForRowGroup(PositionOutputStream out) throws IOException {
long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);
if (isPaddingNeeded(remaining)) {
LOG.debug("Adding {} bytes of padding (row group size={}B, block size={}B)", remaining, rowGroupSize, dfsBlockSize);
for (; remaining > 0; remaining -= zeros.length) {
out.write(zeros, 0, (int) Math.min((long) zeros.length, remaining));
}
}
}
@Override
public long nextRowGroupSize(PositionOutputStream out) throws IOException {
if (maxPaddingSize <= 0) {
return rowGroupSize;
}
long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);
if (isPaddingNeeded(remaining)) {
return rowGroupSize;
}
return Math.min(remaining, rowGroupSize);
}
protected boolean isPaddingNeeded(long remaining) {
return (remaining <= maxPaddingSize);
}
}
}